Built with R version 4.2.2.

Setup

Libraries

# library(car)
library(cowplot)
library(data.table)
library(DESeq2)
library(DHARMa)
library(ggpubr)
library(grid)
library(gridExtra)
library(iNEXT)
library(knitr)
library(lmPerm)
library(MASS)
library(pscl)
# library(rcompanion)
library(tidyverse)
library(vegan)
library(viridis)

# library(devtools)
# install_github("eastmallingresearch/Metabarcoding_pipeline/scripts")
library(metafuncs)

Functions and constants

# Analysis constants

ALPHA =      0.1   # DESeq2 alpha value
OTUFILTER =  0.01  # Remove OTUs with proportion of total reads below value
READFILTER = 0.05  # Will remove samples with read sum below sample_median_reads*READFILTER
PAIREDONLY = FALSE # Will remove the pair of samples which fail the readfilter - probably only useful for DESeq separated by type NOTE removes pairs before DESeq object is created
TAXCONF =    0.80  # Sets the taxonomy confidence level to get "rank" in taxonomy files
TOPOTU =     10    # Number of Top OTUs for summary information
DIFFOTU =    200   # Number of Top OTUs for correlation analysis

# graphics
DEVICE =     "png"
DPI =        1200
WIDTH =      9
HEIGHT =     9

# Model design
FACTORS = c("Site", "Storage", "Scion")
DESIGN = y ~ Site + Storage + Scion
FULL_DESIGN = y ~ Site * Storage * Scion
canker_design = "Cankers ~ Site * Storage * Scion"
# colour blind palette
cbPalette <- c(
  "#000000", "#E69F00", "#56B4E9", "#009E73", 
  "#F0E442", "#0072B2", "#D55E00", "#CC79A7"
)

# Project helper functions (metabarcoding utilities, loaders, rarefaction)
source("functions/metabarcoding.R")
source("functions/loadme.R")
source("functions/rarefaction.R")

Load data

Bacterial and fungal ASV (ZOTU) tables, sample metadata, and taxonomy files are loaded into named lists using the loadData function from Greg’s metafuncs package.

Site names are encoded as follows according to previous work:

# Sample metadata file shared by both amplicons.
metadata <- "sample_metadata.txt"

# Load data
# loadData (metafuncs) returns a named list with (at least) countData, colData
# and taxData elements; RHB tags each list with its amplicon ("BAC"/"FUN").
ubiome_BAC <- loadData("data/BAC.zotu_table.txt",metadata,"data/zBAC.sintax.taxa",RHB="BAC")
ubiome_FUN <- loadData("data/FUN.zotu_table.txt",metadata,"data/zFUN.sintax.taxa",RHB="FUN")

# Change sites Avalon -> 1, Scripps -> 2, and WWF -> 3.
# Storage from planting date.
# No storage for December plantings, yes for March and April (4months).
# Recode sample metadata: rename site/cultivar columns, encode Site as a
# numeric factor (Avalon=1, Scripps=2, WWF=3) and derive Storage from the
# planting date (dec = no storage; march/april = ~4 months storage).
# Unmatched values fall through to NA.
mutate_factors <- function(data){
  renamed <- rename(data, location = site, Scion = cultivar)
  mutate(
    renamed,
    Site = as.factor(case_when(
      location == "Avalon" ~ 1,
      location == "Scripps" ~ 2,
      location == "WWF" ~ 3
    )),
    Storage = as.factor(case_when(
      planting_date %in% c("march", "april") ~ "yes",
      planting_date %in% c("dec") ~ "no"
    )),
    Scion = as.factor(Scion)
  )
}

# Apply the factor recoding to the sample metadata of both amplicon datasets.
ubiome_BAC$colData <- mutate_factors(ubiome_BAC$colData)
ubiome_FUN$colData <- mutate_factors(ubiome_FUN$colData)

Global removals

# Sample "A2-7" (ID "HMA27") removed due to missampling.
# Helper removes the given sample IDs from both colData (rows) and
# countData (columns) of a ubiome list; drop = FALSE keeps countData
# rectangular even if only one sample were to remain.
drop_samples <- function(ubiome, ids) {
  ubiome$colData <- ubiome$colData[!rownames(ubiome$colData) %in% ids, ]
  ubiome$countData <- ubiome$countData[, !colnames(ubiome$countData) %in% ids, drop = FALSE]
  ubiome
}
ubiome_BAC <- drop_samples(ubiome_BAC, "HMA27")
ubiome_FUN <- drop_samples(ubiome_FUN, "HMA27")

Filter samples and OTUs

Filtering taxa

Plantae taxa are filtered from fungal taxData. Chloroplast and Eukaryote taxa are filtered from bacterial taxData. Corresponding OTUs are removed from countData.

# Filter Plant, Chloroplast, and Eukaryote OTUs

# Fungi: Plantae OTUs
cat("Fungi:", length(grep("Plantae", ubiome_FUN$taxData$kingdom)), "Plantae OTUs\n")
# Fungi: 0 Plantae OTUs
# Bacteria: Chloroplast (Streptophyta) and Eukaryote OTUs
cat(
  "Bacteria:", length(grep("Streptophyta", ubiome_BAC$taxData$genus)), "Chloroplast OTUs;", 
  length(grep("Eukaryota", ubiome_BAC$taxData$kingdom)), "Eukaryote OTUs\n"
)
# Bacteria: 37 Chloroplast OTUs; 188 Eukaryote OTUs
# Filter Chloroplast and Eukaryote
# Chloroplasts are only removed when the genus call is confident
# (g_conf >= TAXCONF); Eukaryote removal below has no confidence gate.
filt <- rownames(
  ubiome_BAC$taxData[
    grepl("Streptophyta", ubiome_BAC$taxData$genus) & 
    as.numeric(ubiome_BAC$taxData$g_conf) >= TAXCONF,
  ]
)

filt <- c(filt, rownames(ubiome_BAC$taxData[grep("Eukaryota", ubiome_BAC$taxData$kingdom), ]))

cat("Bacteria: removing", length(filt), "OTUs")
# Bacteria: removing 198 OTUs
# Drop the flagged OTUs from both the taxonomy and count tables.
ubiome_BAC$taxData <- ubiome_BAC$taxData[!rownames(ubiome_BAC$taxData) %in% filt, ]
ubiome_BAC$countData <- ubiome_BAC$countData[!rownames(ubiome_BAC$countData) %in% filt, ]

Filtering samples

Plot rarefaction curves.

Remove samples with read count below 5 % of median.

# Unpack each dataset's list elements (countData, colData, ...) into globals,
# then build rarefaction curves with gfunc (project helper; returns a ggplot).
invisible(mapply(assign, names(ubiome_BAC), ubiome_BAC, MoreArgs = list(envir = globalenv())))
rare_bac <- gfunc(countData, colData, "Bacteria")
# rare_bac <- gfunc(as.data.frame(counts(dds)), as.data.frame(colData(dds)), "Bacteria ZOTU")
invisible(mapply(assign, names(ubiome_FUN), ubiome_FUN, MoreArgs = list(envir = globalenv())))
rare_fun <- gfunc(countData, colData, "Fungi")
# rare_fun <- gfunc(as.data.frame(counts(dds)), as.data.frame(colData(dds)), "Fungi ZOTU")

# Stack both curves into one figure with a shared y-axis label.
rarefaction_plots <- grid.arrange(
  rare_bac, rare_fun,
  left = textGrob(label = expression("log"[10] * " aligned sequences"), rot = 90),
  bottom = "ASV count", nrow = 2
)

ggsave(filename = "rarefaction_plots.png", plot = rarefaction_plots, path = "figures/")

rarefaction_plots

# Report samples whose total read count is at or below READFILTER * median.
# The original `!colSums(x) > y` relied on `!` binding looser than `>`;
# rewritten as the equivalent `<=` for clarity.
# NOTE(review): this chunk only reports; no samples are removed here —
# confirm whether removal was intended.

# Fungi
med <- median(colSums(ubiome_FUN$countData))
filt <- colSums(ubiome_FUN$countData) <= med * READFILTER
cat("Fungi: ",sum(filt),"sample(s) removed\n")

# Bacteria
med <- median(colSums(ubiome_BAC$countData))
filt <- colSums(ubiome_BAC$countData) <= med * READFILTER
cat("Bacteria: ",sum(filt),"sample(s) removed\n")

Filter ASVs

ASV read count

Number of ASVs which account for 50 %, 90 %, 99 %, and 100 % of total reads.

# Count how many ASVs (taken in decreasing read-abundance order) jointly
# account for at most `proportion` of the total reads in countData.
# Returns an integer count of ASVs.
asv_propotions <- function(countData, proportion){
  totals <- sort(rowSums(countData), decreasing = TRUE)
  cumulative_share <- cumsum(totals) / sum(totals)
  sum(cumulative_share <= proportion)
}

# Proportion thresholds to tabulate (50 %, 90 %, 99 %, 100 % of reads).
proportions <- c(0.5, 0.9, 0.99, 1)

# vapply (rather than lapply) so the table holds typed integer columns
# instead of list columns; asv_propotions returns a length-1 integer.
top_asvs <- data.table(
  "proportion" = proportions,
  "Fungi" = vapply(proportions, function(x) asv_propotions(ubiome_FUN$countData, x), integer(1)),
  "Bacteria" = vapply(proportions, function(x) asv_propotions(ubiome_BAC$countData, x), integer(1))
)

top_asvs
#    proportion Fungi Bacteria
# 1:       0.50    10      169
# 2:       0.90   171     2186
# 3:       0.99   995     5883
# 4:       1.00  2401     7265

Filter ASVs

Remove ASVs with read count below 1 % of total reads.

# Drop low-abundance OTUs (share of total reads below OTUFILTER) from both
# the taxonomy and count tables; the same logic is applied to each kingdom,
# so it is factored into one helper (also fixes the missing newline on the
# Bacteria message).
filter_low_otus <- function(ubiome, label) {
  keep <- filter_otus(ubiome$countData, OTUFILTER)
  cat(paste0(label, ":"), "removing", nrow(ubiome$countData) - length(keep), "OTUs\n")
  ubiome$taxData <- ubiome$taxData[rownames(ubiome$taxData) %in% keep, ]
  ubiome$countData <- ubiome$countData[rownames(ubiome$countData) %in% keep, ]
  ubiome
}

ubiome_FUN <- filter_low_otus(ubiome_FUN, "Fungi")
# Fungi: removing 1406 OTUs
ubiome_BAC <- filter_low_otus(ubiome_BAC, "Bacteria")
# Bacteria: removing 1382 OTUs

Absolute abundance normalisation

OTU normalisation is performed using qPCR theoretical copy number data. Copy number is calculated per mg of root sample from the qPCR data.

Prepare qPCR abundance data

# qPCR copy-number data (theoretical copy number per mg of root sample).
abundance <- fread("mean_abundance.csv")

# Add sample ID to abundance data
# IDs are built as "HM" + Sample with dashes stripped (e.g. "A2-7" -> "HMA27"),
# presumably to match the count-table sample names — verify against colData.
abundance$id <- paste0("HM", gsub("-", "", abundance$Sample))
# abundance$id <- abundance$Sample
abundance$copy_number <- abundance$MeanAdjustedTCN_mg
abundance$log_copy_number <- log10(abundance$copy_number)

# Add bacterial (16S) and fungal (ITS) abundance to ubiome BAC and FUN named lists
ubiome_FUN$abundance <- abundance[abundance$Target == "ITS"] %>%
  column_to_rownames(var = "id")
ubiome_BAC$abundance <- abundance[abundance$Target == "16S"] %>%
  column_to_rownames(var = "id")

# Merge copy number from abundance with colData
# by = 0 merges on row names; the generated Row.names column is restored
# as row names afterwards.
ubiome_FUN$colData <- merge(
  ubiome_FUN$colData, 
  ubiome_FUN$abundance[, c("Target", "copy_number", "log_copy_number")], 
  by = 0
) %>% column_to_rownames(var = "Row.names")
ubiome_BAC$colData <- merge(
  ubiome_BAC$colData, 
  ubiome_BAC$abundance[, c("Target", "copy_number", "log_copy_number")], 
  by = 0
) %>% column_to_rownames(var = "Row.names")

Remove outliers

# Return the rows of data frame `x` whose column `val` lies more than
# `threshold` standard deviations away from the column median.
detect_outliers <- function(x, val, threshold, na.rm = TRUE) {
  centre <- median(x[[val]], na.rm = na.rm)
  spread <- sd(x[[val]], na.rm = na.rm)
  is_outlier <- abs(x[[val]] - centre) > threshold * spread
  x[is_outlier, ]
}

# Flag samples whose copy number is > 3 SD from the median.
outliers_FUN <- detect_outliers(ubiome_FUN$abundance, "MeanAdjustedTCN_mg", 3)
outliers_BAC <- detect_outliers(ubiome_BAC$abundance, "MeanAdjustedTCN_mg", 3)

# Remove samples with copy number > 3 std from the median
# NOTE(review): only fungal outliers are removed below; outliers_BAC is
# computed but never used — confirm whether that is intentional.
outliers <- rownames(outliers_FUN)
ubiome_FUN$abundance <- ubiome_FUN$abundance[!rownames(ubiome_FUN$abundance) %in% outliers, ]
ubiome_FUN$countData <- ubiome_FUN$countData[, !colnames(ubiome_FUN$countData) %in% outliers]
ubiome_FUN$colData <- ubiome_FUN$colData[!rownames(ubiome_FUN$colData) %in% outliers, ]

cat("Fungi: removing", length(outliers), "outlier(s)\n")
# Fungi: removing 1 outlier(s)

Sample A1-3 is removed from the fungal data due to abnormally high copy number.

Canker count data

Canker count data for sampled trees only.

# Canker count data for sampled trees only

# Column 6 is deliberately skipped by the select range.
canker_data <- fread("canker_data.csv", select = c(1:5, 7:34))

# Remove spaces from column names and convert to lowercase
colnames(canker_data) <- tolower(gsub(" ", "_", colnames(canker_data)))

# Codify site names, add storage and total canker count for timepoint 4
# NOTE(review): month strings here are capitalised ("March", "Dec") unlike
# the lowercase ones in mutate_factors, and Storage is left as character
# (not factor) — presumably the source files differ; verify.
canker_data <- mutate(
  canker_data,
  Site = case_when(
    site == "Avalon" ~ 1,
    site == "Scripps" ~ 2,
    site == "WWF" ~ 3
  ) %>% as.factor(),
  Storage = case_when(
    planting_date %in% c("March", "April") ~ "yes",
    planting_date %in% c("Dec") ~ "no"
  ),
  Scion = as.factor(cultivar),
  # Total cankers at timepoint 4 = sum over the five assessed branches a4..e4.
  total_cankers = a4 + b4 + c4 + d4 + e4
)

# Identify samples with missing values
missing <- unique(canker_data[!complete.cases(canker_data), code])

# Also remove sample A2-7 due to missampling
missing <- c(missing, "HMA27")

# Remove missing samples from canker data
canker_data <- canker_data[!canker_data$code %in% missing, ]

# Verify that there are two trees for each sample
canker_data %>% group_by(code) %>% summarise(n = n()) %>% filter(n != 2)
# # A tibble: 0 × 2
# # … with 2 variables: code <chr>, n <int>
# Sum of total cankers for each pair of trees with matching code
# (Site/Storage/Scion are identical within a pair, so first() is safe.)
cankers <- canker_data %>% 
  group_by(code) %>% 
  summarise(
    Site = first(Site),
    Storage = first(Storage),
    Scion = first(Scion),
    Cankers = sum(total_cankers)
  ) %>% 
  column_to_rownames("code")

# Add total canker count to colData for both FUN and BAC
# Left join on row names (by = 0); samples without canker data get NA.
ubiome_FUN$colData <- merge(
  ubiome_FUN$colData, 
  cankers["Cankers"], 
  by = 0,
  all.x = T
) %>% column_to_rownames("Row.names")

ubiome_BAC$colData <- merge(
  ubiome_BAC$colData, 
  cankers["Cankers"], 
  by = 0,
  all.x = T
) %>% column_to_rownames("Row.names")

Summary stats

# png("figures/hist.png", width = 800, height = 600)
# hist(cankers$Cankers, breaks = 20, main = "Total canker count", xlab = "Total canker count")
# dev.off()

# Density of total canker count, faceted by site, with per-site means.
cankers_hist <- ggdensity(
  cankers, x = "Cankers", fill = "Site", facet.by = "Site", ncol = 1,
  add = "mean", rug = T, palette = cbPalette,
  title = "Total canker count", xlab = "Total canker count"
)

cankers_hist

ggsave(filename = "cankers_hist.png", plot = cankers_hist, path = "figures/")

# Boxplot of canker count per site, coloured by scion, with jittered points.
cankers_box <- ggboxplot(
  cankers, x = "Site", y = "Cankers", palette = cbPalette,
  color = "Scion", add = "jitter", legend = "top", 
  title = "Total canker count", xlab = "Site", ylab = "Total canker count"
)

cankers_box

ggsave(filename = "cankers_box.png", plot = cankers_box, path = "figures/")

# Mean ± SE barplot of canker count per site/scion combination.
cankers_bar <- ggbarplot(
  cankers, x = "Site", y = "Cankers", fill = "Scion", 
  palette = cbPalette, add = "mean_se", position = position_dodge(0.8),
  title = "Total canker count", xlab = "Site", ylab = "Total canker count"
)

cankers_bar

ggsave(filename = "cankers_bar.png", plot = cankers_bar, path = "figures/")

GLM

# Effect of Site, Scion, and Storage on canker count

# Formula
# Substitute Cankers as the response in the full 3-way factorial design.
formula <- update(FULL_DESIGN, Cankers ~ .)
# formula <- Cankers ~ Site + Storage + Scion + site:Storage + site:Scion + Storage:Scion

# Log-linear model
# log(Cankers + 1) to accommodate zero counts.
canker_lm <- lm(update(FULL_DESIGN, log(Cankers + 1) ~ .), data = cankers)

par(mfrow = c(2, 2))
plot(canker_lm)

# Residual checking
# NOTE(review): res is computed but not used later in this chunk.
res <- resid(canker_lm, type = "pearson")

# Poisson model
canker_poisson <- glm(formula, data = cankers, family = "poisson")

# DHARMa simulated-residual diagnostics for the Poisson fit.
poisson_plot <- plot(simulateResiduals(canker_poisson), title = "Poisson model")

# Model overdispersed

# Negative binomial model
canker_negbin <- glm.nb(formula, data = cankers)

sim <- simulateResiduals(canker_negbin)

plot(sim, title = "Negative binomial model")

# canker_model_plots <- ggarrange(lm_plot, poisson_plot, negbin_plot, ncol = 3)

# ggsave(filename = "canker_model_plots.png", plot = canker_model_plots, path = "figures/")

# png("figures/canker_residuals.png", width = 800, height = 600)
# plot(sim)
# dev.off()

# Zero-inflation check on the negative binomial fit (DHARMa).
testZeroInflation(sim)
# 
#   DHARMa zero-inflation test via comparison to expected zeros with
#   simulation under H0 = fitted model
# 
# data:  simulationOutput
# ratioObsSim = 0.77399, p-value = 0.76
# alternative hypothesis: two.sided

# Nagelkerke pseudo-R-squared: nagelkerke() comes from the rcompanion
# package, whose library() call at the top of the script is commented out,
# so the bare call previously aborted with "could not find function".
# Guard it so the script continues either way.
if (requireNamespace("rcompanion", quietly = TRUE)) {
  rcompanion::nagelkerke(canker_negbin)
} else {
  message("rcompanion not installed; skipping Nagelkerke pseudo R-squared")
}
# Model good fit

# Sequential chi-squared ANOVA of the negative binomial model; each term's
# deviance is expressed as a percentage of the total (null) deviance.
canker_anova <- anova(canker_negbin, test = "Chisq") %>% data.frame()
total_deviance <- sum(canker_anova$Deviance, na.rm = T) + tail(canker_anova$Resid..Dev, 1)
canker_anova$Perc..Dev <- canker_anova$Deviance / total_deviance * 100

canker_anova
#                    Df     Deviance Resid..Df Resid..Dev     Pr..Chi.
# NULL               NA           NA        73  314.98196           NA
# Site                2 118.75744595        71  196.22452 1.629852e-26
# Storage             1   0.02066259        70  196.20386 8.857019e-01
# Scion               6  24.29375368        64  171.91010 4.611042e-04
# Site:Storage        2  28.86115598        62  143.04895 5.406044e-07
# Site:Scion         12  31.65405909        50  111.39489 1.564383e-03
# Storage:Scion       6   7.40666778        44  103.98822 2.848693e-01
# Site:Storage:Scion 11  26.62949240        33   77.35873 5.225415e-03
#                       Perc..Dev
# NULL                         NA
# Site               37.702935303
# Storage             0.006559927
# Scion               7.712744377
# Site:Storage        9.162796386
# Site:Scion         10.049483066
# Storage:Scion       2.351457744
# Site:Storage:Scion  8.454291190

Create DESeq objects

# Make sure countData and colData still match, if they do, create DESeq objects, if not throw error.
# The identical guarded construction was duplicated for FUN and BAC;
# factored into one helper (ubiom_to_des is a project helper).
make_dds <- function(ubiome, label) {
  if (!identical(colnames(ubiome$countData), rownames(ubiome$colData))) {
    stop(label, " countData and colData do not match")
  }
  ubiome$dds <- ubiom_to_des(ubiome)
  print(paste(label, "DESeq object created"))
  ubiome
}

ubiome_FUN <- make_dds(ubiome_FUN, "FUN")
# [1] "FUN DESeq object created"
ubiome_BAC <- make_dds(ubiome_BAC, "BAC")
# [1] "BAC DESeq object created"

Abundance normalisation

Absolute abundance normalisation using DESeq2 size factors.

Values are centred around the mean of the copy number.

# Normalise count data using DESeq2 size factors

# Override DESeq2's default (median-of-ratios) size factors with the qPCR
# copy number centred on its mean, so counts(dds, normalize = TRUE)
# downstream yields copy-number-scaled reads.
ubiome_FUN$dds$sizeFactor <- ubiome_FUN$dds$copy_number / mean(ubiome_FUN$dds$copy_number)
ubiome_BAC$dds$sizeFactor <- ubiome_BAC$dds$copy_number / mean(ubiome_BAC$dds$copy_number)
# Save environment
save.image("data_loaded.RData")

Fungi

# Unpack fungi data
# Promote the list elements (countData, colData, taxData, dds, ...) to
# global variables for the fungal analyses that follow.
invisible(mapply(assign, names(ubiome_FUN), ubiome_FUN, MoreArgs = list(envir = globalenv())))

OTU and sample summary

Read and sample summary

# Per-sample raw read totals for the (unpacked) fungal count matrix.
cat(
  "Raw reads", "\n\n",
  "Total raw reads:\t\t", sum(countData), "\n",
  "Mean raw reads per sample:\t", mean(colSums(countData)), "\n",
  "Median raw reads per sample:\t", median(colSums(countData)), "\n",
  "Max raw reads per sample:\t", max(colSums(countData)), "\n",
  "Min raw reads per sample:\t", min(colSums(countData)), "\n\n"
)
# Raw reads 
# 
#  Total raw reads:      7293776 
#  Mean raw reads per sample:    90046.62 
#  Median raw reads per sample:  93435 
#  Max raw reads per sample:     113518 
#  Min raw reads per sample:     38472
#colSums(countData)

# Size-factor-normalised counts (qPCR copy-number scaling set earlier).
nct <- counts(dds, normalize = T)
cat("Normalised reads", "\n\n",
  "Total normalised reads:\t\t", sum(nct), "\n",
  "Mean normalised reads per sample:\t", mean(colSums(nct)), "\n",
  "Median normalised reads per sample:\t", median(colSums(nct)), "\n",
  "Min normalised reads per sample:\t", min(colSums(nct)), "\n",
  "Max normalised reads per sample:\t", max(colSums(nct)), "\n\n"
)
# Normalised reads 
# 
#  Total normalised reads:       12468857 
#  Mean normalised reads per sample:     153936.5 
#  Median normalised reads per sample:   98624.28 
#  Min normalised reads per sample:  28901.7 
#  Max normalised reads per sample:  881441.3
#round(colSums(counts(dds,normalize = T)),0)

OTU summary

# Per-OTU raw read summary for the fungal data.
cat(
  "Total OTUs:\t\t", nrow(taxData),"\n\n",
  "Raw reads per OTU summary", "\n\n",
  "Mean raw reads per OTU:\t", mean(rowSums(countData)),"\n",
  "Median raw per OTU:\t\t", median(rowSums(countData)),"\n",
  "OTU raw Min reads:\t\t", min(rowSums(countData)),"\n",
  "OTU raw Max reads:\t\t", max(rowSums(countData)),"\n\n"
)
# Total OTUs:        995 
# 
#  Raw reads per OTU summary 
# 
#  Mean raw reads per OTU:   7330.428 
#  Median raw per OTU:       588 
#  OTU raw Min reads:        115 
#  OTU raw Max reads:        714327
# Per-OTU normalised read summary (nct from the previous chunk).
cat(
  "Normalised reads per OTU summary","\n\n",
  "Mean normalised reads per OTU:\t\t", mean(rowSums(nct)),"\n",
  "Median normalised reads per OTU:\t", median(rowSums(nct)),"\n",
  "OTU normalised Min reads:\t\t", min(rowSums(nct)),"\n",
  "OTU normalised Max reads:\t\t", max(rowSums(nct)),"\n\n"
)
# Normalised reads per OTU summary 
# 
#  Mean normalised reads per OTU:        12531.51 
#  Median normalised reads per OTU:  1025.725 
#  OTU normalised Min reads:         101.2814 
#  OTU normalised Max reads:         1509459
# Rank OTUs by total normalised reads and report the TOPOTU most abundant,
# with their read share and taxonomy "rank" label from taxData.
y <- rowSums(nct)
y <- y[order(y, decreasing = T)]
# proportion
xy <- y / sum(y)

cat("Top " ,TOPOTU, "OTUs:\n")
# Top  10 OTUs:
data.frame(counts = y[1:TOPOTU], proportion = xy[1:TOPOTU], rank = taxData[names(y)[1:TOPOTU],]$rank)
#          counts proportion                          rank
# OTU2  1509458.8 0.12105832                 Ascomycota(p)
# OTU1  1490469.5 0.11953538 Dactylonectria macrodidyma(s)
# OTU5  1068164.1 0.08566657              Leotiomycetes(c)
# OTU4  1059908.0 0.08500442                 Ascomycota(p)
# OTU3   480660.1 0.03854885    Ilyonectria destructans(s)
# OTU7   290896.6 0.02332985                   Fusarium(g)
# OTU6   227927.9 0.01827978        Ilyonectria robusta(s)
# OTU9   201690.6 0.01617555                 Ascomycota(p)
# OTU8   191083.5 0.01532486                   Fusarium(g)
# OTU11  131684.2 0.01056105      Truncatella angustata(s)

Taxonomy Summary

Taxonomy identifiable

Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank.

# Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank

tx <- copy(taxData)
setDT(tx)
# Confidence columns, kingdom..species; `cols` is reused by a later chunk.
cols <- names(tx)[9:15]

tx[, (cols) := lapply(.SD, as.factor), .SDcols = cols]

# NOTE(review): as.number() is not a base R function — presumably a metafuncs
# helper that recovers numeric values from these factor columns; confirm.
# A plain as.numeric() on a factor would return level indices, not values,
# so do not substitute it blindly.
data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.8) / nrow(tx))), 2),
  "0.65" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.65) / nrow(tx))), 2),
  "0.5" = round(unlist(lapply(cols, function(col) sum(as.number(tx[[col]]) >= 0.5) / nrow(tx))), 2)
)
#       rank  0.8 0.65  0.5
# 1: kingdom 1.00 1.00 1.00
# 2:  phylum 0.84 0.87 0.90
# 3:   class 0.70 0.74 0.78
# 4:   order 0.54 0.60 0.64
# 5:  family 0.42 0.45 0.49
# 6:   genus 0.38 0.42 0.47
# 7: species 0.24 0.30 0.35

Percentage of reads which can be assigned at each taxonomic rank.

# Share of normalised reads assignable at each rank/confidence threshold.
# tx is rebuilt from the raw taxData (confidence columns still character),
# so as.numeric() is safe here; `cols` comes from the previous chunk.
tx <-taxData[rownames(dds),]
nc <- counts(dds, normalize = T)
ac <- sum(nc)

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.8),]) / ac * 100))), 2),
  "0.65" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.65),]) / ac * 100))), 2),
  "0.5" = round(unlist(lapply(cols, function(col)(sum(nc[which(as.numeric(tx[[col]]) >= 0.5),]) / ac * 100))), 2)
)
#       rank    0.8   0.65    0.5
# 1: kingdom 100.00 100.00 100.00
# 2:  phylum  84.14  96.59  96.83
# 3:   class  60.12  70.92  71.47
# 4:   order  53.49  58.87  68.76
# 5:  family  44.97  46.80  50.25
# 6:   genus  46.06  48.01  50.72
# 7: species  30.44  36.70  41.62

Taxonomy plots

Plots of proportion of normalised reads assigned to members of phylum and class.

# Input triple for getSummedTaxa: normalised counts, taxonomy, metadata.
dat <- list(as.data.frame(counts(dds, normalize = T)), taxData, as.data.frame(colData(dds)))

design <- c("Site", "Storage")

# md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, cutoff = 0.1)
# Phylum-level summed abundances per Site x Storage group.
md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "phylum", cutoff = 0.1)

# In-place data.table mutations; taxa ordered by decreasing abundance.
md1[, Site := factor(Site, levels = c(1, 2, 3))]
md1[, Storage := factor(Storage, levels = c("no", "yes"))]
md1[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

# Drop taxa whose mean abundance value is below 0.5.
removals <- md1[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md1 <- md1[!taxon %in% removals, ]

fun_phylum_plot <- plotfun1(md1, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/fun_phylum.png", fun_phylum_plot, width = 25, height = 15, units = "cm")

fun_phylum_plot

# Same procedure at class level.
md2 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "class", cutoff = 0.1)

md2[, Site := factor(Site, levels = c(1, 2, 3))]
md2[, Storage := factor(Storage, levels = c("no", "yes"))]
md2[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

removals <- md2[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md2 <- md2[!taxon %in% removals, ]

fun_class_plot <- plotfun1(md2, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/fun_class.png", fun_class_plot, width = 25, height = 15, units = "cm")

fun_class_plot

Abundance

Plot copy number for each sample grouped by site, Scion, and Storage. Test the effect of site, Scion, and Storage on copy number using ANOVA.

# abundance_plot <- ggplot(
#   data = as.data.frame(colData(dds)), 
#   aes(x = site, y = log_copy_number, colour = Scion, shape = Storage)
# ) + geom_jitter() + 
#   scale_colour_manual(values = cbPalette)

# Boxplot of log10 copy number per site, coloured by scion.
fun_abundance_box <- ggboxplot(
  data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number", 
  color = "Scion", add = "jitter", legend = "top", 
  title = "Fungal abundance", xlab = "Site", ylab = "log10 copy number"
)

ggsave(
  filename = "fun_abundance.png", plot = fun_abundance_box, path = "figures/", 
  height = 20, width = 20, units = "cm"
)

fun_abundance_box

# Mean ± SE barplot of log10 copy number by storage, filled by site.
fun_abundance_bar <- ggbarplot(
  data = as.data.frame(colData(dds)), x = "Storage", y = "log_copy_number", 
  fill = "Site", add = "mean_se", 
  palette = cbPalette, position = position_dodge(0.8),
  title = "(a) Fungal abundance", xlab = "Storage ", ylab = "Mean copy number (log10)"
) + guides(fill = guide_legend(title = "Site"))

ggsave(
  filename = "fun_abundance_bar.png", plot = fun_abundance_bar, path = "figures/", 
  height = 20, width = 20, units = "cm"
)

fun_abundance_bar

# Formula for ANOVA
# Full factorial design with log10 copy number as response.
formula <- update(FULL_DESIGN, log_copy_number ~ .)

abundance_anova <- aov(formula, data = as.data.frame(colData(dds)))

# Normality check
# Standard aov diagnostic panel, shown and also written to file.
par(mfrow = c(2, 2))
plot(abundance_anova)

png("figures/fun_abundance_norm.png", width = 800, height = 600)
par(mfrow = c(2, 2))
plot(abundance_anova)
dev.off()
# png 
#   2
# Results
summary(abundance_anova)
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2  0.861  0.4306   4.663 0.0153 *
# Storage             1  0.501  0.5012   5.427 0.0251 *
# Scion               6  0.477  0.0795   0.860 0.5324  
# Site:Storage        2  0.683  0.3415   3.698 0.0338 *
# Site:Scion         12  1.203  0.1003   1.086 0.3981  
# Storage:Scion       6  0.458  0.0763   0.827 0.5564  
# Site:Storage:Scion 12  0.918  0.0765   0.828 0.6214  
# Residuals          39  3.602  0.0924                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Convert the aov summary to a data frame and express each term's sum of
# squares as a percentage of the total variance.
abundance_results <- abundance_anova %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(abundance_results$Sum.Sq)
abundance_results$Perc.Var <- abundance_results$Sum.Sq / total_variance * 100

abundance_results
#                    Df    Sum.Sq    Mean.Sq   F.value     Pr..F.  Perc.Var
# Site                2 0.8612429 0.43062144 4.6626824 0.01528779  9.895484
# Storage             1 0.5012156 0.50121559 5.4270616 0.02509695  5.758852
# Scion               6 0.4767020 0.07945034 0.8602723 0.53239080  5.477198
# Site:Storage        2 0.6830842 0.34154212 3.6981494 0.03383255  7.848482
# Site:Scion         12 1.2031665 0.10026388 1.0856371 0.39812911 13.824108
# Storage:Scion       6 0.4580790 0.07634649 0.8266645 0.55642162  5.263223
# Site:Storage:Scion 12 0.9180631 0.07650525 0.8283835 0.62137102 10.548335
# Residuals          39 3.6018401 0.09235487        NA         NA 41.384319

Alpha diversity analysis

Alpha diversity plot

# plot alpha diversity - plot_alpha will convert normalised abundances to integer values

# NOTE(review): the recorded output below shows ggsave/print failing with
# "object 'se' not found" inside geom_errorbar — the error originates inside
# plot_alpha (metafuncs), not in this chunk; fix or pin that package.
fun_alpha_plot <- plot_alpha(
  counts(dds, normalize = F), colData(dds),
  design = "Scion", colour = "Site",
  measures = c("Shannon", "Simpson"),
  type = "bar"
) + scale_colour_manual(values = cbPalette) + 
  theme(axis.title.x = element_blank()) +
  ggtitle("Fungal α-diversity")

ggsave(
  filename = "fun_alpha.png", plot = fun_alpha_plot, path = "figures/", 
  height = 20, width = 40, units = "cm"
)
# Error in `geom_errorbar()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 2nd layer.
# Caused by error in `FUN()`:
# ! object 'se' not found
fun_alpha_plot
# Error in `geom_errorbar()`:
# ! Problem while computing aesthetics.
# ℹ Error occurred in the 2nd layer.
# Caused by error in `FUN()`:
# ! object 'se' not found

Permutation based anova on diversity index ranks

# get the diversity index data
all_alpha_ord <- plot_alpha(
  counts(dds, normalize = F),
  colData(dds),
  returnData = T
)

# join diversity indices and metadata
# data.table join: metadata rows looked up by sample name ("Samples").
all_alpha_ord <- all_alpha_ord[
  as.data.table(colData(dds), keep.rownames = "Samples"), 
  on = "Samples"
]

# NOTE(review): this is a data.table reference, not a copy — the setkey()
# and := calls in the following chunks also reorder/modify fun_alpha.
# Use copy(all_alpha_ord) if an untouched snapshot is intended.
fun_alpha <- all_alpha_ord

formula <- FULL_DESIGN # x ~ Site * Storage * Scion + Site / Site.block

Chao1

# Rank-transform Chao1 (as.numeric(as.factor(x)) gives the rank of each
# unique value) and run a permutation ANOVA (lmPerm::aovp, sequential SS).
setkey(all_alpha_ord, S.chao1)
all_alpha_ord[, measure := as.numeric(as.factor(S.chao1))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings:  sequential SS "
summary(result)
# Component 1 :
#                    Df R Sum Sq R Mean Sq Iter Pr(Prob)    
# Site                2  11554.5    5777.2 5000  < 2e-16 ***
# Storage             1   2056.4    2056.4 2137  0.04492 *  
# Site:Storage        2    812.4     406.2  221  0.68326    
# Scion               6    875.7     145.9  243  0.93416    
# Site:Scion         12   2817.7     234.8  904  0.93473    
# Storage:Scion       6   2046.5     341.1  762  0.77690    
# Site:Storage:Scion 12   2735.3     227.9 1039  0.92974    
# Residuals          39  21381.5     548.2                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Express each term's permutation sum of squares as a percentage of total.
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
#                    Df   R.Sum.Sq R.Mean.Sq Iter   Pr.Prob.  Perc.Var
# Site                2 11554.4684 5777.2342 5000 0.00000000 26.094102
# Storage             1  2056.4245 2056.4245 2137 0.04492279  4.644139
# Site:Storage        2   812.3544  406.1772  221 0.68325792  1.834585
# Scion               6   875.6763  145.9461  243 0.93415638  1.977589
# Site:Scion         12  2817.7201  234.8100  904 0.93473451  6.363415
# Storage:Scion       6  2046.5389  341.0898  762 0.77690289  4.621813
# Site:Storage:Scion 12  2735.3173  227.9431 1039 0.92974013  6.177320
# Residuals          39 21381.5000  548.2436   NA         NA 48.287037

Shannon

# Same rank-based permutation ANOVA as for Chao1, on the Shannon index.
setkey(all_alpha_ord, shannon)
all_alpha_ord[, measure := as.numeric(as.factor(shannon))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings:  sequential SS "
summary(result)
# Component 1 :
#                    Df R Sum Sq R Mean Sq Iter Pr(Prob)    
# Site                2  12291.8    6145.9 5000  < 2e-16 ***
# Storage             1   1077.6    1077.6  103  0.49515    
# Site:Storage        2   2320.4    1160.2 1106  0.08318 .  
# Scion               6    570.9      95.1  278  1.00000    
# Site:Scion         12   3082.1     256.8  512  0.82617    
# Storage:Scion       6   2730.6     455.1 2562  0.44067    
# Site:Storage:Scion 12   5311.1     442.6 1166  0.52316    
# Residuals          39  16895.5     433.2                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Express each term's permutation sum of squares as a percentage of total.
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
#                    Df   R.Sum.Sq  R.Mean.Sq Iter   Pr.Prob.  Perc.Var
# Site                2 12291.7504 6145.87520 5000 0.00000000 27.759147
# Storage             1  1077.5749 1077.57489  103 0.49514563  2.433548
# Site:Storage        2  2320.4220 1160.21098 1106 0.08318264  5.240339
# Scion               6   570.8675   95.14458  278 1.00000000  1.289222
# Site:Scion         12  3082.1462  256.84552  512 0.82617188  6.960583
# Storage:Scion       6  2730.6068  455.10114 2562 0.44067135  6.166682
# Site:Storage:Scion 12  5311.1323  442.59435 1166 0.52315609 11.994427
# Residuals          39 16895.5000  433.21795   NA         NA 38.156052

Simpson

# Same rank-based permutation ANOVA, on the Simpson index.
setkey(all_alpha_ord, simpson)
all_alpha_ord[, measure := as.numeric(as.factor(simpson))]
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings:  sequential SS "
summary(result)
# Component 1 :
#                    Df R Sum Sq R Mean Sq Iter Pr(Prob)    
# Site                2  12937.5    6468.8 5000  < 2e-16 ***
# Storage             1    764.2     764.2  675  0.13037    
# Site:Storage        2   2484.2    1242.1 2792  0.05337 .  
# Scion               6   1188.1     198.0  380  0.82632    
# Site:Scion         12   2027.8     169.0  293  0.86007    
# Storage:Scion       6   2529.6     421.6 2377  0.36853    
# Site:Storage:Scion 12   5334.6     444.6  778  0.44730    
# Residuals          39  17014.0     436.3                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Express each term's permutation sum of squares as a percentage of total.
df <- result %>% summary() %>% unclass() %>% data.frame()
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
#                    Df   R.Sum.Sq R.Mean.Sq Iter   Pr.Prob.  Perc.Var
# Site                2 12937.5044 6468.7522 5000 0.00000000 29.217490
# Storage             1   764.2007  764.2007  675 0.13037037  1.725837
# Site:Storage        2  2484.2454 1242.1227 2792 0.05336676  5.610310
# Scion               6  1188.0542  198.0090  380 0.82631579  2.683049
# Site:Scion         12  2027.7819  168.9818  293 0.86006826  4.579453
# Storage:Scion       6  2529.5940  421.5990 2377 0.36853176  5.712724
# Site:Storage:Scion 12  5334.6194  444.5516  778 0.44730077 12.047469
# Residuals          39 17014.0000  436.2564   NA         NA 38.423668

Beta diversity PCA/NMDS

PCA

# Number of PCs to include
n_pcs <- 20

# Perform PC decomposition of DES object
# des_to_pca is a project helper; mypca carries $x (scores) and $percentVar.
mypca <- des_to_pca(dds)

# To get pca plot axis into the same scale create a dataframe of PC scores multiplied by their variance
fun_pca <- t(data.frame(t(mypca$x) * mypca$percentVar))

# formula is reused by the per-PC ANOVA chunk below.
formula = FULL_DESIGN

Percent variation in first 20 PCs

# Cumulative percentage of variance explained by successive PCs.
# FIX: seq_along() instead of 1:length() (safe for zero-length input).
pca_cum_var <- data.frame(
  cumulative = cumsum(mypca$percentVar * 100),
  no = seq_along(mypca$percentVar)
)

# Plot cumulative percentage of variance explained
fun_cum_pca <- ggline(
  pca_cum_var, x = "no", y = "cumulative", plot_type = "l",
  xlab = "Number of PCs", ylab = "Cumulative % variance explained",
  title = "Fungi: cumulative % variance explained by PCs"
)
# FIX: removed the trailing comma in this call; an empty trailing argument
# is an "argument is empty" error in R.
ggsave(filename = "fun_cum_pca.png", plot = fun_cum_pca, path = "figures/")
fun_cum_pca

# Percentage of variance explained by each of the first n_pcs PCs
pca_var <- data.frame(
  row.names = paste0("PC", 1:n_pcs),
  perc_var = round(mypca$percentVar[1:n_pcs] * 100, 1)
)

pca_var
#      perc_var
# PC1      27.1
# PC2      21.2
# PC3       8.4
# PC4       4.2
# PC5       3.3
# PC6       2.6
# PC7       1.9
# PC8       1.8
# PC9       1.7
# PC10      1.6
# PC11      1.3
# PC12      1.3
# PC13      1.2
# PC14      1.0
# PC15      1.0
# PC16      0.9
# PC17      0.9
# PC18      0.9
# PC19      0.8
# PC20      0.8

ANOVA of first 20 PCs

# Per-PC ANOVA: fit the full factorial design to each of the first n_pcs
# PC scores; yields a named list of aov summaries (one entry per PC)
pca_summary <- apply(
  mypca$x[, 1:n_pcs], 2, 
  function(x){
    summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))
  }
)

pca_summary
# $PC1
#                    Df Sum Sq Mean Sq F value  Pr(>F)    
# Site                2  71073   35537 238.166 < 2e-16 ***
# Storage             1     31      31   0.208 0.65102    
# Scion               6    871     145   0.973 0.45635    
# Site:Storage        2   2232    1116   7.480 0.00178 ** 
# Site:Scion         12   1299     108   0.726 0.71791    
# Storage:Scion       6    696     116   0.777 0.59290    
# Site:Storage:Scion 12   1781     148   0.994 0.47159    
# Residuals          39   5819     149                    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC2
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2  41657   20829  62.213 7.35e-13 ***
# Storage             1    411     411   1.228  0.27459    
# Scion               6    125      21   0.062  0.99892    
# Site:Storage        2   5403    2701   8.069  0.00117 ** 
# Site:Scion         12   2269     189   0.565  0.85649    
# Storage:Scion       6    416      69   0.207  0.97246    
# Site:Storage:Scion 12   2080     173   0.518  0.89032    
# Residuals          39  13057     335                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC3
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2   2091  1045.4   3.075 0.0575 .
# Storage             1    250   250.4   0.737 0.3960  
# Scion               6    976   162.7   0.479 0.8202  
# Site:Storage        2   1598   798.8   2.350 0.1087  
# Site:Scion         12   3182   265.2   0.780 0.6668  
# Storage:Scion       6   1399   233.2   0.686 0.6619  
# Site:Storage:Scion 12   3176   264.7   0.779 0.6682  
# Residuals          39  13257   339.9                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC4
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2   1692   846.2   5.486 0.00795 **
# Storage             1    156   155.8   1.010 0.32103   
# Scion               6    166    27.7   0.179 0.98087   
# Site:Storage        2   1216   608.2   3.943 0.02757 * 
# Site:Scion         12   1399   116.6   0.756 0.68963   
# Storage:Scion       6    637   106.1   0.688 0.66035   
# Site:Storage:Scion 12   1578   131.5   0.853 0.59855   
# Residuals          39   6016   154.2                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC5
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    238   119.1   0.739  0.484
# Storage             1    457   456.7   2.835  0.100
# Scion               6    326    54.4   0.338  0.913
# Site:Storage        2    313   156.3   0.970  0.388
# Site:Scion         12   1292   107.7   0.669  0.770
# Storage:Scion       6    534    88.9   0.552  0.765
# Site:Storage:Scion 12    680    56.7   0.352  0.973
# Residuals          39   6281   161.1               
# 
# $PC6
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    141   70.29   0.602  0.553
# Storage             1    279  278.51   2.384  0.131
# Scion               6    225   37.51   0.321  0.922
# Site:Storage        2    262  130.85   1.120  0.337
# Site:Scion         12    428   35.65   0.305  0.985
# Storage:Scion       6    582   96.92   0.830  0.554
# Site:Storage:Scion 12   1540  128.37   1.099  0.388
# Residuals          39   4557  116.85               
# 
# $PC7
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   44.9   22.47   0.283  0.755
# Storage             1   16.0   16.02   0.202  0.655
# Scion               6  689.5  114.92   1.450  0.221
# Site:Storage        2   13.3    6.67   0.084  0.919
# Site:Scion         12 1060.5   88.37   1.115  0.376
# Storage:Scion       6  265.0   44.16   0.557  0.761
# Site:Storage:Scion 12  838.4   69.87   0.881  0.572
# Residuals          39 3091.5   79.27               
# 
# $PC8
#                    Df Sum Sq Mean Sq F value Pr(>F)   
# Site                2    1.1     0.6   0.008 0.9919   
# Storage             1  425.5   425.5   6.057 0.0184 * 
# Scion               6  147.8    24.6   0.351 0.9051   
# Site:Storage        2  769.7   384.9   5.478 0.0080 **
# Site:Scion         12  613.2    51.1   0.727 0.7163   
# Storage:Scion       6  594.6    99.1   1.411 0.2352   
# Site:Storage:Scion 12  401.4    33.5   0.476 0.9166   
# Residuals          39 2739.9    70.3                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC9
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2   74.9    37.4   0.648   0.5287    
# Storage             1  210.8   210.8   3.647   0.0635 .  
# Scion               6  246.7    41.1   0.711   0.6425    
# Site:Storage        2 1500.6   750.3  12.982 4.77e-05 ***
# Site:Scion         12  418.6    34.9   0.604   0.8258    
# Storage:Scion       6  238.5    39.8   0.688   0.6606    
# Site:Storage:Scion 12  206.8    17.2   0.298   0.9860    
# Residuals          39 2254.0    57.8                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC10
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   17.2    8.62   0.107  0.898
# Storage             1   13.0   12.98   0.162  0.690
# Scion               6  279.3   46.55   0.580  0.744
# Site:Storage        2  112.7   56.35   0.702  0.502
# Site:Scion         12  544.6   45.38   0.566  0.856
# Storage:Scion       6   80.8   13.46   0.168  0.984
# Site:Storage:Scion 12  646.4   53.87   0.671  0.767
# Residuals          39 3129.1   80.23               
# 
# $PC11
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   13.2    6.58   0.090  0.915
# Storage             1    7.3    7.30   0.099  0.754
# Scion               6  170.3   28.38   0.386  0.883
# Site:Storage        2   54.8   27.42   0.373  0.691
# Site:Scion         12  328.5   27.38   0.373  0.966
# Storage:Scion       6  344.3   57.38   0.781  0.590
# Site:Storage:Scion 12  372.9   31.08   0.423  0.945
# Residuals          39 2865.9   73.48               
# 
# $PC12
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   55.1   27.54   0.547  0.583
# Storage             1  105.0  105.05   2.086  0.157
# Scion               6  196.0   32.66   0.648  0.691
# Site:Storage        2  144.5   72.25   1.434  0.251
# Site:Scion         12  722.8   60.24   1.196  0.320
# Storage:Scion       6  378.6   63.11   1.253  0.301
# Site:Storage:Scion 12  560.2   46.68   0.927  0.531
# Residuals          39 1964.5   50.37               
# 
# $PC13
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   18.7    9.37   0.150  0.861
# Storage             1   32.2   32.21   0.517  0.476
# Scion               6   76.6   12.76   0.205  0.973
# Site:Storage        2   92.4   46.18   0.742  0.483
# Site:Scion         12  127.3   10.61   0.170  0.999
# Storage:Scion       6  213.6   35.60   0.572  0.750
# Site:Storage:Scion 12  639.9   53.32   0.857  0.595
# Residuals          39 2427.9   62.25               
# 
# $PC14
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2    7.9    3.93   0.109 0.8970  
# Storage             1    6.4    6.45   0.179 0.6749  
# Scion               6  168.8   28.14   0.780 0.5909  
# Site:Storage        2  114.4   57.19   1.585 0.2179  
# Site:Scion         12  346.5   28.88   0.800 0.6480  
# Storage:Scion       6  509.5   84.92   2.353 0.0491 *
# Site:Storage:Scion 12  625.9   52.15   1.445 0.1875  
# Residuals          39 1407.4   36.09                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC15
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    4.0    2.02   0.055  0.946
# Storage             1   13.7   13.67   0.375  0.544
# Scion               6  131.7   21.94   0.601  0.728
# Site:Storage        2   15.1    7.55   0.207  0.814
# Site:Scion         12  560.8   46.74   1.280  0.269
# Storage:Scion       6  322.9   53.81   1.474  0.212
# Site:Storage:Scion 12  503.5   41.96   1.149  0.352
# Residuals          39 1423.7   36.51               
# 
# $PC16
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    1.2    0.62   0.017  0.983
# Storage             1   36.3   36.33   1.007  0.322
# Scion               6  152.9   25.49   0.707  0.646
# Site:Storage        2  143.2   71.62   1.986  0.151
# Site:Scion         12  535.6   44.64   1.238  0.294
# Storage:Scion       6  349.3   58.22   1.615  0.169
# Site:Storage:Scion 12  189.8   15.82   0.439  0.937
# Residuals          39 1406.4   36.06               
# 
# $PC17
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    4.5    2.26   0.058  0.944
# Storage             1   20.5   20.49   0.527  0.472
# Scion               6  218.1   36.35   0.935  0.481
# Site:Storage        2   99.3   49.66   1.277  0.290
# Site:Scion         12  351.3   29.27   0.753  0.692
# Storage:Scion       6  163.4   27.23   0.700  0.651
# Site:Storage:Scion 12  344.9   28.75   0.739  0.705
# Residuals          39 1516.2   38.88               
# 
# $PC18
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   33.5   16.77   0.532  0.592
# Storage             1    6.3    6.27   0.199  0.658
# Scion               6  305.1   50.85   1.613  0.170
# Site:Storage        2   40.2   20.08   0.637  0.534
# Site:Scion         12  540.3   45.02   1.428  0.195
# Storage:Scion       6  299.6   49.94   1.584  0.178
# Site:Storage:Scion 12  225.2   18.77   0.595  0.833
# Residuals          39 1229.8   31.53               
# 
# $PC19
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    5.4    2.68   0.081  0.922
# Storage             1    1.4    1.44   0.043  0.836
# Scion               6  246.4   41.06   1.244  0.306
# Site:Storage        2   12.7    6.35   0.192  0.826
# Site:Scion         12  316.6   26.39   0.799  0.649
# Storage:Scion       6  202.4   33.73   1.022  0.426
# Site:Storage:Scion 12  372.3   31.02   0.940  0.519
# Residuals          39 1287.3   33.01               
# 
# $PC20
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2   24.8   12.39   0.356  0.703
# Storage             1   23.8   23.81   0.683  0.414
# Scion               6  154.0   25.66   0.736  0.624
# Site:Storage        2   17.1    8.56   0.246  0.783
# Site:Scion         12  180.6   15.05   0.432  0.941
# Storage:Scion       6  146.0   24.33   0.698  0.653
# Site:Storage:Scion 12  501.6   41.80   1.199  0.318
# Residuals          39 1359.6   34.86

Percent variation in first 20 PCs for each factor

# Extract PC scores as a list of dataframes
pcas <- lapply(pca_summary, function(i) data.frame(unclass(i)))

# Merge into a single long dataframe, adding per-term % variance, the PC
# number, and a whitespace-stripped term name (summary.aov pads rownames
# with trailing spaces)
pcs_factors_tidy <- lapply(
  names(pcas),
  function(name) mutate(
    pcas[[name]], 
    var = Sum.Sq / sum(pcas[[name]]$Sum.Sq) * 100,
    PC = substring(name, 3),
    Factor = gsub(" ", "", rownames(pcas[[name]]))
  )
) %>% bind_rows() %>% data.table()

# Adjust p values for FDR (Benjamini-Hochberg, across all PCs and terms)
pcs_factors_tidy$p.adj <- p.adjust(pcs_factors_tidy$`Pr..F.`, method = "BH")

# Significant factors
kable(pcs_factors_tidy[p.adj < 0.05, c("PC", "Factor", "Df", "F.value", "p.adj", "var")])
PC Factor Df F.value p.adj var
1 Site 2 238.165810 0.0000000 84.810927
1 Site:Storage 2 7.479562 0.0498485 2.663475
2 Site 2 62.212524 0.0000000 63.678672
2 Site:Storage 2 8.068552 0.0408951 8.258702
9 Site:Storage 2 12.982328 0.0022262 29.133773

PCA plot

# PCA ordination plot (PC1 vs PC2), coloured by Site with Storage as shapes.
# FIX: removed the trailing comma after `alpha = 0.75`; an empty trailing
# argument is an "argument is empty" error in R.
fun_pca_plot <- plotOrd(
  fun_pca,
  colData(dds),
  design = "Site",
  shapes = "Storage",
  axes = c(1, 2),
  cbPalette = T,
  alpha = 0.75
) # + facet_wrap(~facet) 
#   geom_line(aes(group=facet),alpha=0.25,linetype=3,colour="#000000") + 
#   theme(text = element_text(size=14))

ggsave(filename = "fun_pca_plot.png", plot = fun_pca_plot, path = "figures/")

fun_pca_plot

PCA sum of squares (% var)

# Apportion total variance across design factors: per-PC ANOVA sums of
# squares, converted to proportions and weighted by each PC's % variance
sum_squares <- apply(mypca$x, 2 ,function(x) 
  summary(aov(update(formula, x ~ .), data = cbind(x, colData(dds))))[[1]][2]
)
sum_squares <- do.call(cbind, sum_squares)
# Proportion of each PC's variance attributable to each model term
x <- t(apply(sum_squares, 2, prop.table))
# Weight by the fraction of total variance each PC explains
perVar <- x * mypca$percentVar
#colSums(perVar)
round(colSums(perVar) / sum(colSums(perVar)) * 100, 3)
# Site               Storage            Scion              Site:Storage       
#             38.022              1.000              3.083              4.878 
# Site:Scion         Storage:Scion      Site:Storage:Scion Residuals          
#              7.803              4.057              8.281             32.876

ADONIS

# Calculate Bray-Curtis distance matrix from normalised counts.
# FIX: use the exact DESeq2::counts argument name `normalized = TRUE`
# (was `normalize = T`, which relied on partial argument matching and the
# reassignable shorthand T).
vg <- vegdist(t(counts(dds, normalized = TRUE)), method = "bray")

formula <- update(FULL_DESIGN, vg ~ .)

# Reproducible permutations
set.seed(sum(utf8ToInt("Hamish McLean")))
result <- adonis2(formula, colData(dds), permutations = 1000)
result
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 1000
# 
# adonis2(formula = formula, data = colData(dds), permutations = 1000)
#                    Df SumOfSqs      R2       F   Pr(>F)    
# Site                2   6.5423 0.31527 18.6694 0.000999 ***
# Storage             1   0.4053 0.01953  2.3133 0.014985 *  
# Scion               6   0.9096 0.04383  0.8652 0.769231    
# Site:Storage        2   0.7206 0.03472  2.0562 0.005994 ** 
# Site:Scion         12   2.1487 0.10354  1.0219 0.424575    
# Storage:Scion       6   1.1647 0.05613  1.1079 0.278721    
# Site:Storage:Scion 12   2.0269 0.09768  0.9640 0.580420    
# Residual           39   6.8334 0.32930                     
# Total              80  20.7514 1.00000                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tabulate and express each term's sum of squares as % of the total
df <- result %>% data.frame()
df$Perc.Var <- df$SumOfSqs / df["Total", "SumOfSqs"] * 100
df
#                    Df   SumOfSqs         R2          F      Pr..F.   Perc.Var
# Site                2  6.5423049 0.31527021 18.6694186 0.000999001  31.527021
# Storage             1  0.4053241 0.01953235  2.3133026 0.014985015   1.953235
# Scion               6  0.9095936 0.04383283  0.8652192 0.769230769   4.383283
# Site:Storage        2  0.7205681 0.03472380  2.0562460 0.005994006   3.472380
# Site:Scion         12  2.1486813 0.10354382  1.0219291 0.424575425  10.354382
# Storage:Scion       6  1.1646804 0.05612533  1.1078616 0.278721279   5.612533
# Site:Storage:Scion 12  2.0269039 0.09767543  0.9640108 0.580419580   9.767543
# Residual           39  6.8333647 0.32929623         NA          NA  32.929623
# Total              80 20.7514210 1.00000000         NA          NA 100.000000
#                    Df   SumOfSqs         R2          F      Pr..F.   Perc.Var
# Site                2  6.5423049 0.31527021 18.6694186 0.000999001  31.527021
# Storage             1  0.4053241 0.01953235  2.3133026 0.014985015   1.953235
# Scion               6  0.9095936 0.04383283  0.8652192 0.769230769   4.383283
# Site:Storage        2  0.7205681 0.03472380  2.0562460 0.005994006   3.472380
# Site:Scion         12  2.1486813 0.10354382  1.0219291 0.424575425  10.354382
# Storage:Scion       6  1.1646804 0.05612533  1.1078616 0.278721279   5.612533
# Site:Storage:Scion 12  2.0269039 0.09767543  0.9640108 0.580419580   9.767543
# Residual           39  6.8333647 0.32929623         NA          NA  32.929623
# Total              80 20.7514210 1.00000000         NA          NA 100.000000

NMDS ordination

# NMDS ordination of the Bray-Curtis distances (vegan::metaMDS), seeded
# for reproducibility
set.seed(sum(utf8ToInt("Hamish McLean")))
ord <- metaMDS(vg,trace=0) 
#sratmax=20000,maxit=20000,try = 177, trymax = 177

fun_nmds <- scores(ord)

# NOTE(review): this call uses `shape =` where the PCA plot above uses
# `shapes =` -- confirm which spelling plotOrd() actually accepts.
fun_nmds_plot <- plotOrd(
  fun_nmds, colData(dds), 
  design = "Site", 
  shape = "Storage", 
  alpha = 0.75, cbPalette = T
) #+ theme(text = element_text(size = 14))

ggsave(filename = "fun_nmds_plot.png", plot = fun_nmds_plot, path = "figures/")

fun_nmds_plot

ASV abundance

Explore distribution of ASV read counts

# Extract normalised counts from the DESeq object.
# FIX: exact argument name `normalized = TRUE` (was `normalize = T`,
# relying on partial argument matching of DESeq2::counts).
asv_counts <- counts(dds, normalized = TRUE) %>% as.data.frame()

# Sum ASV counts across samples
total_asv_counts <- rowSums(asv_counts)

# Sort ASVs by abundance (most abundant first)
total_asv_counts <- total_asv_counts[order(total_asv_counts, decreasing = TRUE)]

# Calculate cumulative percentage of reads per ASV rank
cumulative <- data.frame(
  cumulative = cumsum(total_asv_counts) / sum(total_asv_counts) * 100,
  no = seq_along(total_asv_counts)
)

# Plot cumulative percentage of ASVs
fun_cum_asv <- ggline(
  data = cumulative, x = "no", y = "cumulative", 
  plot_type = "l", palette = cbPalette,
  title = "Cumulative percentage of fungal ASVs", xlab = "Number of ASVs", 
  ylab = "Cumulative percentage of reads"
)
ggsave(filename = "fun_cum_asv.png", plot = fun_cum_asv, path = "figures/")
fun_cum_asv

# Find the number of ASVs that account for 50%, 80%, and 99% of total reads.
# BUG FIX: `cumulative` is a two-column data frame (cumulative, no);
# `sum(cumulative <= 50)` compared BOTH columns, inflating each count by the
# number of `no` values below the threshold. Compare the cumulative column
# only. (The old rendered output was internally inconsistent: 140 ASVs
# "reach 80%" while 100 ASVs already account for 85.9%.)
cat(
  "Number of ASVs that account for 50%, 80%, and 99% of total reads", "\n\n",
  "50%:", sum(cumulative$cumulative <= 50), "\n",
  "80%:", sum(cumulative$cumulative <= 80), "\n",
  "99%:", sum(cumulative$cumulative <= 99), "\n"
)
# Previously rendered output (inflated by the bug above; re-render to update):
# Number of ASVs that account for 50%, 80%, and 99% of total reads 
# 
#  50%: 57 
#  80%: 140 
#  99%: 741
# Find the cumulative percentage accounted for by top x ASVs
cat(
  "Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:", "\n\n",
  "100:", round(cumulative[cumulative$no == 100, "cumulative"], 1) , "\n",
  "200:", round(cumulative[cumulative$no == 200, "cumulative"], 1) , "\n",
  "500:", round(cumulative[cumulative$no == 500, "cumulative"], 1) , "\n"
)
# Percentage of total reads accounted for by the top 100, 200,and 500 ASVs: 
# 
#  100: 85.9 
#  200: 92.6 
#  500: 98.1
# Average ASV counts in order (mean read count per ASV, descending)
mean_asv_counts <- rowMeans(asv_counts)
mean_asv_counts <- mean_asv_counts[order(mean_asv_counts, decreasing = T)]

# Plot read count distribution
fun_asv_counts <- ggline(
  data = data.frame(ASV = seq_along(mean_asv_counts), counts = mean_asv_counts),
  x = "ASV", y = "counts", plot_type = "l",
  title = "Fungal ASV read count distribution", xlab = "ASV", ylab = "Mean read count"
)
ggsave(filename = "fun_asv_counts.png", plot = fun_asv_counts, path = "figures/")
fun_asv_counts

# Number of ASVs with mean read count > 100, 200, and 500 (these thresholds
# drive the top-ASV filter used below)
cat(
  "Number of ASVs with mean read count > 100, 200, and 500", "\n\n",
  "100:", sum(rowMeans(asv_counts) > 100), "\n",
  "200:", sum(rowMeans(asv_counts) > 200), "\n",
  "500:", sum(rowMeans(asv_counts) > 500), "\n"
)
# Number of ASVs with mean read count > 100, 200, and 500 
# 
#  100: 147 
#  200: 86 
#  500: 49

Filter top ASVs with mean read count > 100

# Filter the top x abundant OTUs by the sum of their normalised counts
# top_asvs <- asv_counts[order(rowSums(asv_counts), decreasing = T)[1:DIFFOTU], ]

# Filter ASVs with mean read count > 100 (147 ASVs per the summary above)
top_asvs <- asv_counts[rowMeans(asv_counts) > 100, ]

# Check that sample names (columns of top_asvs) match the metadata rownames.
# NOTE(review): bare `colData` here must be a data frame in the global
# environment, not the DESeq2 accessor -- confirm it is defined upstream.
identical(names(top_asvs), rownames(colData))
# [1] TRUE
# Extract taxonomic data for top OTUs
top_taxa <- taxData[rownames(top_asvs), ]

# Log transform normalised counts
# top_asvs <- log10(top_asvs + 1)

# Transpose so samples are rows and ASVs are columns
top_asv_data <- data.frame(t(top_asvs))
top_asv_ids <- rownames(top_asvs)

# Check that sample names match
identical(rownames(top_asv_data), rownames(colData))
# [1] TRUE
# Add sample metadata to top OTU data (merge on rownames via by = 0)
top_asv_data <- merge(top_asv_data, colData, by = 0) %>% column_to_rownames("Row.names")

Effect of design factors on abundance of top ASVs

Effect of Site, Scion, and Storage on abundance of top ASVs (mean read count > 100; 147 ASVs)

# ANOVA of top ASVs
# ANOVA of a single ASV's log-transformed abundance against the design.
#
# Args:
#   otu:     ASV/OTU identifier (must be a column name of `data`)
#   formula: model formula whose RHS supplies the design terms
#   data:    data frame of per-sample ASV counts plus the design factors
# Returns a one-row data frame of % variance and p-values per model term.
# Depends on the global `taxData` for the Taxonomy column.
otu_lm_anova <- function(otu, formula, data) {
  f = update(formula, paste0("log(", otu, " + 1) ~ ."))
  a = aov(f, data = data) %>% summary() %>% unclass() %>% data.frame()
  # FIX: summary.aov() pads term rownames with trailing whitespace; strip it
  # so terms can be indexed by their plain names. The original indexed with
  # hand-counted trailing spaces (e.g. 'Site '), which silently yields NA
  # whenever the padding width differs.
  rownames(a) <- trimws(rownames(a))
  total_var = sum(a$Sum.Sq)
  d = data.frame(
    OTU = otu,
    Taxonomy = taxData[otu, "rank"],
    Abundance = mean(data[[otu]]),
    site_var = a['Site', 'Sum.Sq'] / total_var * 100,
    site_p = a['Site', 'Pr..F.'],
    scion_var = a['Scion', 'Sum.Sq'] / total_var * 100,
    scion_p = a['Scion', 'Pr..F.'],
    storage_var = a['Storage', 'Sum.Sq'] / total_var * 100,
    storage_p = a['Storage', 'Pr..F.'],
    site_scion_var = a['Site:Scion', 'Sum.Sq'] / total_var * 100,
    site_scion_p = a['Site:Scion', 'Pr..F.'],
    site_storage_var = a['Site:Storage', 'Sum.Sq'] / total_var * 100,
    site_storage_p = a['Site:Storage', 'Pr..F.'],
    storage_scion_var = a['Storage:Scion', 'Sum.Sq'] / total_var * 100,
    storage_scion_p = a['Storage:Scion', 'Pr..F.'],
    residuals_var = a['Residuals', 'Sum.Sq'] / total_var * 100
  )
  return(d)
}

# Negative binomial regression model
otu_negbin_anova <- function(otu, formula, data) {
  f = update(formula, paste0(otu, " ~ ."))
  m = glm.nb(f, data = data)
  a = anova(m, test = "Chisq") %>% data.frame()
  total_deviance = sum(a$Deviance, na.rm = T) + tail(a$Resid..Dev, 1)
  d = data.frame(
    OTU = otu,
    Taxonomy = taxData[otu, "rank"],
    site_var = a['Site', 'Deviance'] / total_deviance * 100,
    site_p = a['Site', 'Pr..Chi.'],
    cultivar_var = a['Scion', 'Deviance'] / total_deviance * 100,
    cultivar_p = a['Scion', 'Pr..Chi.'],
    storage_var = a['Storage', 'Deviance'] / total_deviance * 100,
    storage_p = a['Storage', 'Pr..Chi.']
  )
  return(d)
}

formula <- FULL_DESIGN

# Full design model does not converge
# formula <- y ~ site + Scion + Storage + site:Scion + site:Storage

# formula <- DESIGN

# otu_lm_anova(top_asv_ids[1], formula, top_asv_data)

# Run the per-ASV ANOVA for every top ASV.
# NOTE(review): sapply() over a data.frame-returning function yields a matrix
# of list elements; t() %>% data.table() therefore produces list columns.
otu_anova_results <- sapply(
  top_asv_ids, function(x) otu_lm_anova(x, formula, top_asv_data)
) %>% t() %>% data.table()

# Benjamini-Hochberg adjustment, applied separately per model term
otu_anova_results_adjusted <- otu_anova_results %>% 
  mutate(
    site_p = p.adjust(site_p, method = "BH"),
    scion_p = p.adjust(scion_p, method = "BH"),
    storage_p = p.adjust(storage_p, method = "BH"),
    site_scion_p = p.adjust(site_scion_p, method = "BH"),
    site_storage_p = p.adjust(site_storage_p, method = "BH"),
    storage_scion_p = p.adjust(storage_scion_p, method = "BH")
  )

cat(
  "Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values", "\n\n",
  "Site:", nrow(otu_anova_results_adjusted[site_p < 0.05, ]), "\n",
  "Scion:", nrow(otu_anova_results_adjusted[scion_p < 0.05, ]), "\n",
  "Storage:", nrow(otu_anova_results_adjusted[storage_p < 0.05, ]), "\n",
  "Site:Scion:", nrow(otu_anova_results_adjusted[site_scion_p < 0.05, ]), "\n",
  "Site:Storage:", nrow(otu_anova_results_adjusted[site_storage_p < 0.05, ]), "\n",
  "Storage:Scion:", nrow(otu_anova_results_adjusted[storage_scion_p < 0.05, ]), "\n\n"
)
# Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values 
# 
#  Site: 136 
#  Scion: 0 
#  Storage: 1 
#  Site:Scion: 0 
#  Site:Storage: 36 
#  Storage:Scion: 0
otu_anova_results_adjusted[scion_p < 0.05, ]
# Empty data.table (0 rows and 16 cols): OTU,Taxonomy,Abundance,site_var,site_p,scion_var...
otu_anova_results_adjusted[site_scion_p < 0.05, ]
# Empty data.table (0 rows and 16 cols): OTU,Taxonomy,Abundance,site_var,site_p,scion_var...
otu_anova_results_adjusted[storage_scion_p < 0.05, ]
# Empty data.table (0 rows and 16 cols): OTU,Taxonomy,Abundance,site_var,site_p,scion_var...

Canker counts

Testing the effects of total abundance, ASV abundance, α-diversity, and β-diversity on canker counts.

This uses a nested negative binomial regression model.

The base model for canker counts uses the formula: Cankers ~ Site * Storage * Scion.

# Filter out samples with missing canker count
canker_abundance_data <- colData[complete.cases(colData$Cankers), ]

# Base model: negative binomial GLM of canker counts on the full design
canker_design = "Cankers ~ Site * Storage * Scion"
base_model <- glm.nb(canker_design, data = canker_abundance_data)

# Abundance model: base design plus log total fungal abundance (copy number)
abundance_design = paste(canker_design, "+ log(copy_number)")
abundance_model <- glm.nb(abundance_design, data = canker_abundance_data)

# Likelihood-ratio test: does total abundance improve on the base model?
kable(anova(base_model, abundance_model))
Model theta Resid. df 2 x log-lik. Test df LR stat. Pr(Chi)
Site * Storage * Scion 2.828262 32 -546.8670 NA NA NA
Site * Storage * Scion + log(copy_number) 2.831854 31 -546.8376 1 vs 2 1 0.0293702 0.863927

Effect of ASV abundance on canker count

# Filter out samples with missing canker count
canker_top_asv_data <- top_asv_data[complete.cases(top_asv_data$Cankers), ]
# all.equal(top_asv_data[c("Site", "Storage", "Scion", "Cankers")],cankers)

# Base model design (refit on the canker-complete subset)
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with ASV abundance data
base_model <- glm.nb(canker_design, data = canker_top_asv_data)

# ANOVA of top ASVs with canker count
# Likelihood-ratio test of whether adding log(ASV + 1) to the base
# canker-count negative binomial model improves the fit.
#
# Args:
#   otu:        ASV identifier (a column name of `data`)
#   design:     base model formula as a string
#   base_model: fitted glm.nb base model to compare against
#   data:       per-sample ASV abundances plus design factors and Cankers
# Returns a one-row data frame (OTU, Taxonomy, coefficient, % deviance, p).
# Depends on the global `taxData` for the Taxonomy column.
asv_canker_anova <- function(otu, design, base_model, data) {
  log_term <- paste0("log(", otu, " + 1)")
  extended_model <- glm.nb(paste(design, "+", log_term), data = data)
  lr_test <- data.frame(anova(base_model, extended_model))
  dev_table <- data.frame(suppressWarnings(anova(extended_model)))
  # Null deviance = explained deviance plus the final residual deviance
  total_deviance <- sum(dev_table$Deviance, na.rm = TRUE) +
    tail(dev_table$Resid..Dev, 1)
  data.frame(
    OTU = otu,
    Taxonomy = taxData[otu, "rank"],
    coef = extended_model$coefficients[log_term],
    var = dev_table[log_term, "Deviance"] / total_deviance * 100,
    p = lr_test[2, "Pr.Chi."]
  )
}

kable(asv_canker_anova(top_asv_ids[1], canker_design, base_model, canker_top_asv_data))
OTU Taxonomy coef var p
log(OTU1 + 1) OTU1 Dactylonectria macrodidyma(s) 0.0184892 3.707987 0.9016206
# Effect of ASV abundance on canker count for top ASVs.
# NOTE(review): sapply() over a data.frame-returning function yields a matrix
# of list elements; t() %>% data.table() therefore produces list columns.
asv_canker_results <- sapply(
  top_asv_ids, 
  function(x) asv_canker_anova(x, canker_design, base_model, canker_top_asv_data)
) %>% t() %>% data.table()

# Adjust p-values for multiple testing (Benjamini-Hochberg)
asv_canker_results$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(asv_canker_results[p_adjusted < 0.05, ]), "of", nrow(asv_canker_results),
  "ASVs have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 0 of 147 ASVs have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(asv_canker_results[p_adjusted < 0.05, ]) > 0) {
  asv_canker_results[p_adjusted < 0.05, ]
}

Effect of ASV abundance on canker count per site

site <- 3

canker_site_design <- "Cankers ~ Storage * Scion"

# Base model with ASV abundance data
base_model_1 <- glm.nb(canker_site_design, data = canker_top_asv_data[canker_top_asv_data$Site == site,])

# Effect of ASV abundance on canker count for top ASVs
asv_canker_results_1 <- sapply(
  top_asv_ids, function(x) asv_canker_anova(
    x, canker_site_design, base_model_1, 
    canker_top_asv_data[canker_top_asv_data$Site == site, ]
  )
) %>% t() %>% data.table()

# Adjust p-values for multiple testing
asv_canker_results_1$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(asv_canker_results_1[p_adjusted < 0.05, ]), "of", nrow(asv_canker_results_1),
  "ASVs have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 0 of 147 ASVs have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(asv_canker_results_1[p_adjusted < 0.05, ]) > 0) {
  asv_canker_results_1[p_adjusted < 0.05, ]
}

Effect of aggregated genera on canker counts

# Add genus from taxData to the normalised counts.
# FIX: exact argument name `normalized = TRUE` (was `normalize = T`,
# relying on partial argument matching of DESeq2::counts).
# NOTE(review): genus labels are looked up by rownames(countData), not the
# rownames of the counts(dds) table being mutated -- confirm both objects
# have identical row order, otherwise genera are misassigned.
fun_genus_data <- counts(dds, normalized = TRUE) %>% as.data.frame() %>% mutate(
  genus = taxData[rownames(countData), "genus"]
)

# Group by genus and sum counts per sample
fun_genus_data <- fun_genus_data %>% group_by(genus) %>% summarise_all(sum) %>% as.data.frame()

# Set rownames as genus
rownames(fun_genus_data) <- fun_genus_data$genus
fun_genus_data <- dplyr::select(fun_genus_data, -genus)

# Keep genera with mean abundance > 10
# (comment previously said "< 100", which did not match the code)
fun_genus_data <- fun_genus_data[rowMeans(fun_genus_data) > 10, ]

# Names like "Family(f)" mark higher-rank placeholders, not true genera
not_genus <- rownames(fun_genus_data)[grep("\\([a-z]\\)", rownames(fun_genus_data))]
# Remove rows with genus in not_genus
fun_genus_data <- fun_genus_data[!rownames(fun_genus_data) %in% not_genus, ]
cat(
  length(not_genus), "non-genus ranks removed:\n\n",
  not_genus, "\n"
)
# 1 non-genus ranks removed:
# 
#  Clavicipitaceae(f)
# Final genus list
fun_genera <- rownames(fun_genus_data)

# Transpose (samples as rows) and add metadata from colData
fun_genus_data <- t(fun_genus_data) %>% as.data.frame() %>% mutate(
  Site = colData[rownames(.), "Site"],
  Storage = colData[rownames(.), "Storage"],
  Scion = colData[rownames(.), "Scion"],
  Cankers = colData[rownames(.), "Cankers"]
)

# Filter out samples with missing canker count
fun_genus_data <- fun_genus_data[complete.cases(fun_genus_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with genus abundance data
base_model <- glm.nb(canker_design, data = fun_genus_data)

# ANOVA of genus abundance with canker count
# Likelihood-ratio test of whether adding log(genus + 1) to the base
# canker-count negative binomial model improves the fit.
#
# Args:
#   genus:      genus name (a column of `data`)
#   design:     base model formula as a string
#   base_model: fitted glm.nb base model to compare against
#   data:       per-sample genus abundances plus design factors and Cankers
# Returns a one-row data.table (Genus, Abundance, coefficient, % deviance, p).
genus_canker_anova <- function(genus, design, base_model, data) {
  log_term <- paste0("log(", genus, " + 1)")
  extended_model <- glm.nb(paste(design, "+", log_term), data = data)
  lr_test <- data.frame(anova(base_model, extended_model))
  dev_table <- data.frame(suppressWarnings(anova(extended_model)))
  # Null deviance = explained deviance plus the final residual deviance
  total_deviance <- sum(dev_table$Deviance, na.rm = TRUE) +
    tail(dev_table$Resid..Dev, 1)
  data.table(
    Genus = genus,
    Abundance = mean(data[[genus]]),
    coef = extended_model$coefficients[log_term],
    var = dev_table[log_term, "Deviance"] / total_deviance * 100,
    p = lr_test[2, "Pr.Chi."]
  )
}

# genus_canker_anova(fun_genera[1], canker_design, base_model, fun_genus_data)

# Effect of genera abundance on canker counts (one likelihood-ratio test
# per genus against the shared base model)
genus_canker_results <- sapply(
  fun_genera, 
  function(x) genus_canker_anova(x, canker_design, base_model, fun_genus_data)
) %>% t() %>% data.table()

# Adjust p-values for multiple testing (Benjamini-Hochberg)
genus_canker_results$p_adjusted <- p.adjust(genus_canker_results$p, method = "BH")

# Summary of genera with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(genus_canker_results[p_adjusted < 0.05, ]), "of", nrow(genus_canker_results),
  "genera have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 3 of 197 genera have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(genus_canker_results[p_adjusted < 0.05, ]) > 0) {
  genus_canker_results[p_adjusted < 0.05, ]
}
#         Genus Abundance      coef        var            p   p_adjusted
# 1: Corynascus  29.95441 0.4553139 0.06740052 0.0004488953 0.0337803378
# 2: Juxtiphoma  10.55033 0.5576448   3.793486 3.410562e-06 0.0006718808
# 3:    Lepiota   26.4591 0.3081053  0.2728229 0.0005144214 0.0337803378
sig_genus <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

# Print the full likelihood-ratio test for each significant genus
for (genus in sig_genus) {
  log_genus = paste0("log(", genus, " + 1)")
  f = paste(canker_design, "+", log_genus)
  m = glm.nb(f, data = fun_genus_data)
  print(anova(base_model, m))
}
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                                          Model    theta Resid. df
# 1                       Site * Storage * Scion 2.828262        32
# 2 Site * Storage * Scion + log(Corynascus + 1) 3.479988        31
#      2 x log-lik.   Test    df LR stat.      Pr(Chi)
# 1       -546.8670                                   
# 2       -534.5502 1 vs 2     1 12.31682 0.0004488953
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                                          Model    theta Resid. df
# 1                       Site * Storage * Scion 2.828262        32
# 2 Site * Storage * Scion + log(Juxtiphoma + 1) 4.208062        31
#      2 x log-lik.   Test    df LR stat.      Pr(Chi)
# 1       -546.8670                                   
# 2       -525.2965 1 vs 2     1 21.57051 3.410562e-06
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                                       Model    theta Resid. df    2 x log-lik.
# 1                    Site * Storage * Scion 2.828262        32       -546.8670
# 2 Site * Storage * Scion + log(Lepiota + 1) 3.532200        31       -534.8043
#     Test    df LR stat.      Pr(Chi)
# 1                                   
# 2 1 vs 2     1 12.06265 0.0005144214
Plot of genus abundance against canker count
# Genera passing the FDR threshold (from genus_canker_results above)
significant_genera <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

significant_genera_data <- fun_genus_data[, c(significant_genera, FACTORS, "Cankers")]

# Melt data for ggplot: one row per (sample, genus) with its abundance
significant_genera_data <- significant_genera_data %>% reshape2::melt(
  id.vars = c(FACTORS, "Cankers"), variable.name = "Genus", value.name = "Abundance"
)

# Log transform abundance
# FIX(review): the column name and the axis label both say "log10", but the
# original code used the natural log(); use log10() so values match the label.
significant_genera_data$log10_abundance <- log10(significant_genera_data$Abundance + 1)
# significant_genera_data$log10_cankers <- log(significant_genera_data$Cankers + 1)

# Plot of genus abundance against canker count, one facet per genus.
# FIX(review): dropped free_x/free_y — they are not ggscatter() arguments;
# free scales are already set by facet_wrap(scales = "free").
fun_genus_canker_plot <- ggscatter(
  data = significant_genera_data, x = "log10_abundance", y = "Cankers", 
  color = "Site", shape = "Storage",
  xlab = "Genus abundance (log10)", ylab = "Canker count",
  palette = cbPalette
) + facet_wrap(~ Genus, scales = "free")

ggsave(
  filename = "fun_genus_canker_plot.png", plot = fun_genus_canker_plot, path = "figures/",
  height = 10, width = 20, units = "cm"
)

Effect of α-diversity on canker count

# ANOVA of α-diversity with canker count

# Base model with α-diversity data
# Reference negative binomial GLM of canker counts on the full factorial
# design; the likelihood ratio tests below compare against this model.
base_model <- glm.nb(canker_design, data = all_alpha_ord)

# α-diversity measures to test (columns of all_alpha_ord)
measures <- c("S.chao1", "shannon", "simpson")

# ANOVA of α-diversity with canker count
#
# Extends the global `base_model` (canker_design) with one diversity measure
# and returns a one-row data frame with the measure's coefficient, its share
# of total model deviance (%), and the likelihood-ratio p-value.
alpha_canker_anova <- function(measure, data) {
  extended_formula <- paste(canker_design, "+", measure)
  extended_model <- glm.nb(extended_formula, data = data)
  # LRT of base model vs. extended model
  lr_test <- data.frame(anova(base_model, extended_model))
  # Sequential deviance table of the extended model
  dev_table <- data.frame(anova(extended_model))
  total_deviance <- sum(dev_table$Deviance, na.rm = TRUE) +
    tail(dev_table$Resid..Dev, 1)
  data.frame(
    measure = measure,
    coef = extended_model$coefficients[measure],
    var = dev_table[measure, "Deviance"] / total_deviance * 100,
    p = lr_test[2, "Pr.Chi."]
  )
}

# alpha_canker_anova("shannon", all_alpha_ord)

# Effect of α-diversity on canker count for each measure
# NOTE(review): data.table(t(sapply(...))) yields list-columns; the table
# prints as shown below but values may need unlist() for further numeric work.
alpha_canker_results <- data.table(t(sapply(measures, function(x) alpha_canker_anova(x, all_alpha_ord))))

alpha_canker_results
#    measure        coef        var          p
# 1: S.chao1 0.003337705 0.03842385 0.08796031
# 2: shannon -0.08300265  0.2109241  0.7883279
# 3: simpson   -4.031448   1.819761 0.04187015
# ANOVA results
# Print the full likelihood ratio test output for every diversity measure
for (measure in measures) {
  extended <- glm.nb(paste(canker_design, "+", measure), data = all_alpha_ord)
  print(anova(base_model, extended))
}
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                              Model    theta Resid. df    2 x log-lik.   Test
# 1           Site * Storage * Scion 2.828262        32       -546.8670       
# 2 Site * Storage * Scion + S.chao1 2.934597        31       -543.9557 1 vs 2
#      df LR stat.    Pr(Chi)
# 1                          
# 2     1 2.911312 0.08796031
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                              Model    theta Resid. df    2 x log-lik.   Test
# 1           Site * Storage * Scion 2.828262        32       -546.8670       
# 2 Site * Storage * Scion + shannon 2.836456        31       -546.7949 1 vs 2
#      df   LR stat.   Pr(Chi)
# 1                           
# 2     1 0.07208289 0.7883279
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                              Model    theta Resid. df    2 x log-lik.   Test
# 1           Site * Storage * Scion 2.828262        32       -546.8670       
# 2 Site * Storage * Scion + simpson 3.056752        31       -542.7265 1 vs 2
#      df LR stat.    Pr(Chi)
# 1                          
# 2     1 4.140446 0.04187015

Effect of β-diversity on canker count

# Number of principal components to test against canker counts
no_pcs <- 4

# Merge PC scores with canker data
# mypca/colData here are the fungal PCA results and sample metadata from the
# preceding β-diversity section
pc_scores <- merge(colData, data.frame(mypca$x[, 1:no_pcs]), by = "row.names") %>% 
  column_to_rownames("Row.names")

# Names of the PC score columns just appended (last no_pcs columns)
pcs <- tail(colnames(pc_scores), no_pcs)

# Base model with β-diversity data
# Reference negative binomial GLM for the likelihood ratio tests below
base_model <- glm.nb(canker_design, data = pc_scores)

# ANOVA of β-diversity with canker count
#
# Extends the global `base_model` (canker_design) with one PC score and
# returns the PC's coefficient, its share of total model deviance (%), and
# the likelihood-ratio p-value against the base model.
# FIX(review): use paste() with spaces, matching alpha_canker_anova() above —
# paste0() also parsed, but the formulas now print consistently.
beta_canker_anova <- function(pc, data) {
  f = paste(canker_design, "+", pc)
  m = glm.nb(f, data = data)
  # LRT of base model vs. base model + PC
  a = anova(base_model, m) %>% data.frame()
  # Sequential deviance table of the extended model
  b = anova(m) %>% data.frame()
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    PC = pc,
    coef = m$coefficients[pc],
    var = b[pc, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# Effect of β-diversity on canker count for each PC
# NOTE(review): as above, data.table(t(sapply(...))) yields list-columns.
beta_canker_results <- data.table(t(sapply(pcs, function(x) beta_canker_anova(x, pc_scores))))

beta_canker_results
#     PC         coef        var          p
# 1: PC1   0.01281376   4.654705  0.3209296
# 2: PC2 -0.002516435 0.05130682  0.7117345
# 3: PC3    0.0119703  0.1146305   0.108615
# 4: PC4   0.02633142   1.147527 0.06763427
# Save environment (end of the fungal analysis)
save.image("FUN.RData")

Bacteria

# Unpack bacteria data
# Assigns every element of the ubiome_BAC list into the global environment
# (presumably countData/colData/taxData/dds etc. from loadData — confirm),
# overwriting the fungal versions used above.
invisible(mapply(assign, names(ubiome_BAC), ubiome_BAC, MoreArgs = list(envir = globalenv())))

OTU and sample summary

Read and sample summary

# Summary statistics of raw (un-normalised) read counts per sample
cat(
  "Raw reads", "\n\n",
  "Total raw reads:\t\t", sum(countData), "\n",
  "Mean raw reads per sample:\t", mean(colSums(countData)), "\n",
  "Median raw reads per sample:\t", median(colSums(countData)), "\n",
  "Max raw reads per sample:\t", max(colSums(countData)), "\n",
  "Min raw reads per sample:\t", min(colSums(countData)), "\n\n"
)
# Raw reads 
# 
#  Total raw reads:      3365816 
#  Mean raw reads per sample:    41046.54 
#  Median raw reads per sample:  40406.5 
#  Max raw reads per sample:     89023 
#  Min raw reads per sample:     15049
#colSums(countData)

# DESeq2 size-factor normalised counts
# FIX(review): spell out `normalized = TRUE` — the original `normalize = T`
# relied on partial argument matching and the reassignable T shorthand.
nct <- counts(dds, normalized = TRUE)
# Summary statistics of normalised reads per sample
cat("Normalised reads", "\n\n",
  "Total normalised reads:\t\t", sum(nct), "\n",
  "Mean normalised reads per sample:\t", mean(colSums(nct)), "\n",
  "Median normalised reads per sample:\t", median(colSums(nct)), "\n",
  "Min normalised reads per sample:\t", min(colSums(nct)), "\n",
  "Max normalised reads per sample:\t", max(colSums(nct)), "\n\n"
)
# Normalised reads 
# 
#  Total normalised reads:       4585124 
#  Mean normalised reads per sample:     55916.14 
#  Median normalised reads per sample:   53407.32 
#  Min normalised reads per sample:  9940.045 
#  Max normalised reads per sample:  139399.1
#round(colSums(counts(dds,normalize = T)),0)

OTU summary

# Summary statistics of raw reads per OTU
cat(
  "Total OTUs:\t\t", nrow(taxData),"\n\n",
  "Raw reads per OTU summary", "\n\n",
  "Mean raw reads per OTU:\t", mean(rowSums(countData)),"\n",
  "Median raw per OTU:\t\t", median(rowSums(countData)),"\n",
  "OTU raw Min reads:\t\t", min(rowSums(countData)),"\n",
  "OTU raw Max reads:\t\t", max(rowSums(countData)),"\n\n"
)
# Total OTUs:        5883 
# 
#  Raw reads per OTU summary 
# 
#  Mean raw reads per OTU:   572.1258 
#  Median raw per OTU:       120 
#  OTU raw Min reads:        34 
#  OTU raw Max reads:        106398
# Summary statistics of normalised reads per OTU (nct from the chunk above)
cat(
  "Normalised reads per OTU summary","\n\n",
  "Mean normalised reads per OTU:\t\t", mean(rowSums(nct)),"\n",
  "Median normalised reads per OTU:\t", median(rowSums(nct)),"\n",
  "OTU normalised Min reads:\t\t", min(rowSums(nct)),"\n",
  "OTU normalised Max reads:\t\t", max(rowSums(nct)),"\n\n"
)
# Normalised reads per OTU summary 
# 
#  Mean normalised reads per OTU:        779.3853 
#  Median normalised reads per OTU:  156.6744 
#  OTU normalised Min reads:         20.59746 
#  OTU normalised Max reads:         139400.3
# Normalised read totals per OTU, largest first
y <- sort(rowSums(nct), decreasing = TRUE)
# proportion of all normalised reads contributed by each OTU
xy <- prop.table(y)

cat("Top ", TOPOTU, "OTUs:\n")
# Table of the TOPOTU most abundant OTUs with their taxData rank assignment
data.frame(counts = y[1:TOPOTU], proportion = xy[1:TOPOTU], rank = taxData[names(y)[1:TOPOTU],]$rank)
#          counts  proportion               rank
# OTU1  139400.30 0.030402734    Streptomyces(g)
# OTU2  124366.87 0.027123994  Kineosporiales(o)
# OTU3  113814.51 0.024822559 Kineosporiaceae(f)
# OTU4   99723.88 0.021749441    Streptomyces(g)
# OTU5   96979.25 0.021150846    Streptomyces(g)
# OTU6   79618.94 0.017364621    Streptomyces(g)
# OTU7   51308.93 0.011190305  Bradyrhizobium(g)
# OTU8   49535.82 0.010803594    Actinoplanes(g)
# OTU20  33817.55 0.007375494  Actinobacteria(c)
# OTU9   33580.42 0.007323775      Nonomuraea(g)

Taxonomy Summary

Taxonomy identifiable

Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank

# Proportion of OTUs which can be assigned (with the given confidence) at each taxonomic rank

tx <- copy(taxData)
setDT(tx)
# Columns 9-15 hold the per-rank confidence scores (kingdom..species) —
# TODO(review): confirm against the sintax taxonomy file layout.
cols <- names(tx)[9:15]

tx[, (cols) := lapply(.SD, as.factor), .SDcols = cols]

# Proportion of OTUs whose confidence at each rank meets `threshold`.
# FIX(review): replaced the non-base `as.number()` (presumably a metafuncs
# helper) with an explicit factor -> character -> numeric conversion; plain
# as.numeric() on a factor would return level codes, not the stored values.
conf_prop <- function(threshold) {
  round(unlist(lapply(cols, function(col) sum(as.numeric(as.character(tx[[col]])) >= threshold) / nrow(tx))), 2)
}

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = conf_prop(0.8),
  "0.65" = conf_prop(0.65),
  "0.5" = conf_prop(0.5)
)
#       rank  0.8 0.65  0.5
# 1: kingdom 1.00 1.00 1.00
# 2:  phylum 0.94 0.97 0.99
# 3:   class 0.84 0.90 0.93
# 4:   order 0.65 0.72 0.79
# 5:  family 0.51 0.57 0.64
# 6:   genus 0.35 0.43 0.51
# 7: species 0.00 0.00 0.00

% of reads which can be assigned at each taxonomic rank

# Restrict the taxonomy table to the OTUs present in the DESeq2 object
tx <- taxData[rownames(dds), ]
# Normalised counts and grand total of all normalised reads
nc <- counts(dds, normalize = T)
ac <- sum(nc)

# % of normalised reads assigned at each rank (`cols` from the previous chunk)
# for a given confidence threshold
pct_reads_assigned <- function(threshold) {
  round(unlist(lapply(cols, function(col) sum(nc[which(as.numeric(tx[[col]]) >= threshold), ]) / ac * 100)), 2)
}

data.table(
  rank = c("kingdom", "phylum", "class", "order", "family", "genus", "species"),
  "0.8" = pct_reads_assigned(0.8),
  "0.65" = pct_reads_assigned(0.65),
  "0.5" = pct_reads_assigned(0.5)
)
#       rank   0.8   0.65    0.5
# 1: kingdom 99.95 100.00 100.00
# 2:  phylum 97.78  99.43  99.75
# 3:   class 92.63  95.82  98.05
# 4:   order 71.84  81.25  87.03
# 5:  family 58.63  67.80  73.01
# 6:   genus 44.35  51.20  58.33
# 7: species  0.00   0.00   0.00

Taxonomy plots

Plots of proportion of normalised reads assigned to members of phylum and class.

# Inputs for getSummedTaxa: normalised counts, taxonomy, sample metadata
dat <- list(as.data.frame(counts(dds, normalize = T)), taxData, as.data.frame(colData(dds)))

# Factors to summarise over
design <- c("Site", "Storage")

# md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, cutoff = 0.1)
# Summed phylum-level values (confidence >= TAXCONF) by Site/Storage
md1 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "phylum", cutoff = 0.1)

# Fix factor levels for plotting; order taxa by decreasing value
md1[, Site := factor(Site, levels = c(1, 2, 3))]
md1[, Storage := factor(Storage, levels = c("no", "yes"))]
md1[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

# Drop taxa with mean value below 0.5 (units set by getSummedTaxa — confirm)
removals <- md1[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md1 <- md1[!taxon %in% removals, ]

bac_phylum_plot <- plotfun1(md1, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/bac_phylum.png", bac_phylum_plot, width = 25, height = 15, units = "cm")

bac_phylum_plot

# Summed class-level values (top 9 classes, confidence >= TAXCONF) by Site/Storage
md2 <- getSummedTaxa(dat, conf = TAXCONF, design = design, taxon = "class", cutoff = 0.1, topn = 9)

# Fix factor levels for plotting; order taxa by decreasing value
md2[, Site := factor(Site, levels = c(1, 2, 3))]
md2[, Storage := factor(Storage, levels = c("no", "yes"))]
md2[, taxon := factor(taxon, levels = unique(taxon[order(value, decreasing = T)]))]

# Drop taxa with mean value below 0.5 (same rule as the phylum plot)
removals <- md2[, .(value = mean(value)), by = "taxon"][value < 0.5, taxon]
md2 <- md2[!taxon %in% removals, ]

bac_class_plot <- plotfun1(md2, x = "taxon", fill = "Site") +
  facet_wrap(~ Storage)

ggsave("figures/bac_class.png", bac_class_plot, width = 25, height = 15, units = "cm")

bac_class_plot

Abundance

# NOTE(review): the original first created a ggplot() jitter version of
# abundance_plot and then immediately overwrote it with the ggboxplot()
# below without using it; the dead assignment is kept only as a comment.
# abundance_plot <- ggplot(
#   data = as.data.frame(colData(dds)),
#   aes(x = Site, y = log_copy_number, colour = Scion, shape = Storage)
# ) + geom_jitter() +
#   scale_colour_manual(values = cbPalette)

# Boxplot of bacterial abundance (log10 copy number) by site, coloured by scion
abundance_plot <- ggboxplot(
  data = as.data.frame(colData(dds)), x = "Site", y = "log_copy_number", 
  color = "Scion", add = "jitter", legend = "top", 
  title = "Bacterial abundance", xlab = "Site", ylab = "log10 copy number"
)

ggsave(
  filename = "bac_abundance.png", plot = abundance_plot, path = "figures/", 
  height = 20, width = 20, units = "cm"
)

abundance_plot

# Formula for ANOVA: log_copy_number ~ Site * Storage * Scion
# NOTE: overwrites the global `formula` used by earlier sections
formula <- update(FULL_DESIGN, log_copy_number ~ .)

abundance_anova <- aov(formula, data = as.data.frame(colData(dds)))

# Normality check (residual diagnostic plots, 2x2 panel)
par(mfrow = c(2, 2))
plot(abundance_anova)

# Write the same diagnostic panel to file
png("figures/bac_abundance_norm.png", width = 800, height = 600)
par(mfrow = c(2, 2))
plot(abundance_anova)
dev.off()
# png 
#   2
# Results
summary(abundance_anova)
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2 1.9657  0.9828  25.302 7.91e-08 ***
# Storage             1 0.0798  0.0798   2.056   0.1594    
# Scion               6 0.5131  0.0855   2.202   0.0628 .  
# Site:Storage        2 0.0768  0.0384   0.989   0.3809    
# Site:Scion         12 0.2677  0.0223   0.574   0.8494    
# Storage:Scion       6 0.1640  0.0273   0.704   0.6484    
# Site:Storage:Scion 12 0.1889  0.0157   0.405   0.9530    
# Residuals          40 1.5538  0.0388                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variance explained (%) by each model term in the abundance ANOVA
abundance_results <- data.frame(unclass(summary(abundance_anova)))
total_variance <- sum(abundance_results$Sum.Sq)
abundance_results$Perc.Var <- abundance_results$Sum.Sq / total_variance * 100

abundance_results
#                    Df     Sum.Sq    Mean.Sq    F.value       Pr..F.  Perc.Var
# Site                2 1.96566003 0.98283002 25.3015018 7.913123e-08 40.867065
# Storage             1 0.07984549 0.07984549  2.0555037 1.594283e-01  1.660028
# Scion               6 0.51312816 0.08552136  2.2016206 6.280900e-02 10.668194
# Site:Storage        2 0.07683263 0.03841631  0.9889711 3.808702e-01  1.597389
# Site:Scion         12 0.26772071 0.02231006  0.5743394 8.494197e-01  5.566049
# Storage:Scion       6 0.16398521 0.02733087  0.7035927 6.484006e-01  3.409335
# Site:Storage:Scion 12 0.18892649 0.01574387  0.4053027 9.530223e-01  3.927877
# Residuals          40 1.55378922 0.03884473         NA           NA 32.304063

Alpha diversity analysis

Alpha diversity plot

# plot alpha diversity - plot_alpha will convert normalised abundances to integer values

# Boxplots of bacterial Shannon and Simpson diversity by scion, coloured by site
bac_alpha_plot <- plot_alpha(
  counts(dds,normalize = F), colData(dds),
  design = "Scion", colour = "Site",
  measures = c("Shannon", "Simpson"),
  type="box"
) + 
  scale_colour_manual(values = cbPalette) + 
  theme(axis.title.x =  element_blank()) + 
  ggtitle("Bacterial α-diversity")

# FIX(review): removed a stray duplicate of the `abundance_plot <- ggboxplot(...)`
# block from the abundance section — it was copy-pasted here, merely re-created
# an identical object, and is unused by this section.

ggsave(
  filename = "bac_alpha.png", plot = bac_alpha_plot, path = "figures/", 
  height = 20, width = 40, units = "cm"
)

bac_alpha_plot

Permutation based anova on diversity index ranks

# get the diversity index data
all_alpha_ord <- plot_alpha(
  counts(dds, normalize = F), colData(dds), design = "Site", returnData = T
)

# join diversity indices and metadata
# data.table X[Y, on=] join keyed on sample name (row names of colData)
all_alpha_ord <- all_alpha_ord[
  as.data.table(colData(dds), keep.rownames = "Samples"), on = "Samples"
]

# Keep a bacterial copy before the rank transforms below mutate all_alpha_ord
bac_alpha <- all_alpha_ord

# Full factorial design for the permutation ANOVAs
formula <- FULL_DESIGN

Chao1

# Rank-transform Chao1 (as.numeric(as.factor()) maps sorted values to ranks,
# ties sharing a rank) and run a permutation ANOVA on the ranks
setkey(all_alpha_ord, S.chao1)
all_alpha_ord[, measure := as.numeric(as.factor(S.chao1))]
# lmPerm permutation ANOVA with sequential sums of squares
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings:  sequential SS "
summary(result)
# Component 1 :
#                    Df R Sum Sq R Mean Sq Iter Pr(Prob)    
# Site                2  20914.4   10457.2 5000   <2e-16 ***
# Storage             1    113.5     113.5  110   0.4818    
# Site:Storage        2   3319.8    1659.9 5000   0.0094 ** 
# Scion               6   1291.7     215.3  407   0.5037    
# Site:Scion         12   5425.8     452.1 2201   0.1577    
# Storage:Scion       6    867.4     144.6  699   0.8054    
# Site:Storage:Scion 12   1912.4     159.4  551   0.9456    
# Residuals          40  12095.5     302.4                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variance explained (%) by each term of the Chao1 permutation ANOVA
df <- data.frame(unclass(summary(result)))
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
#                    Df   R.Sum.Sq  R.Mean.Sq Iter  Pr.Prob.   Perc.Var
# Site                2 20914.3677 10457.1839 5000 0.0000000 45.5249023
# Storage             1   113.5359   113.5359  110 0.4818182  0.2471368
# Site:Storage        2  3319.8162  1659.9081 5000 0.0094000  7.2263388
# Scion               6  1291.7022   215.2837  407 0.5036855  2.8116852
# Site:Scion         12  5425.7655   452.1471 2201 0.1576556 11.8104189
# Storage:Scion       6   867.3693   144.5615  699 0.8054363  1.8880275
# Site:Storage:Scion 12  1912.4432   159.3703  551 0.9455535  4.1628699
# Residuals          40 12095.5000   302.3875   NA        NA 26.3286207

Shannon

# Rank-transform Shannon diversity and run a permutation ANOVA on the ranks
setkey(all_alpha_ord, shannon)
all_alpha_ord[, measure := as.numeric(as.factor(shannon))]
# lmPerm permutation ANOVA with sequential sums of squares
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings:  sequential SS "
summary(result)
# Component 1 :
#                    Df R Sum Sq R Mean Sq Iter Pr(Prob)    
# Site                2  23583.2   11791.6 5000   <2e-16 ***
# Storage             1     89.3      89.3  120   0.4583    
# Site:Storage        2    898.4     449.2  714   0.2983    
# Scion               6    994.0     165.7  165   0.7939    
# Site:Scion         12   3435.7     286.3 5000   0.7100    
# Storage:Scion       6    481.5      80.3   84   1.0000    
# Site:Storage:Scion 12   2143.3     178.6 2635   0.9450    
# Residuals          40  14315.0     357.9                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variance explained (%) by each term of the Shannon permutation ANOVA
df <- data.frame(unclass(summary(result)))
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
#                    Df    R.Sum.Sq   R.Mean.Sq Iter  Pr.Prob.   Perc.Var
# Site                2 23583.23942 11791.61971 5000 0.0000000 51.3343116
# Storage             1    89.26865    89.26865  120 0.4583333  0.1943136
# Site:Storage        2   898.39852   449.19926  714 0.2983193  1.9555698
# Scion               6   994.03480   165.67247  165 0.7939394  2.1637440
# Site:Scion         12  3435.70147   286.30846 5000 0.7100000  7.4785896
# Storage:Scion       6   481.54603    80.25767   84 1.0000000  1.0481950
# Site:Storage:Scion 12  2143.31111   178.60926 2635 0.9449715  4.6654066
# Residuals          40 14315.00000   357.87500   NA        NA 31.1598698

Simpson

# Rank-transform Simpson diversity and run a permutation ANOVA on the ranks
setkey(all_alpha_ord, simpson)
all_alpha_ord[, measure := as.numeric(as.factor(simpson))]
# lmPerm permutation ANOVA with sequential sums of squares
result <- aovp(update(formula, measure ~ .), all_alpha_ord, seqs = T)
# [1] "Settings:  sequential SS "
summary(result)
# Component 1 :
#                    Df R Sum Sq R Mean Sq Iter Pr(Prob)    
# Site                2  20851.3   10425.7 5000   <2e-16 ***
# Storage             1     31.0      31.0  163   0.3804    
# Site:Storage        2   1422.3     711.2 1690   0.1118    
# Scion               6   1995.8     332.6  609   0.3924    
# Site:Scion         12   3933.8     327.8 1536   0.4915    
# Storage:Scion       6   1003.5     167.3  677   0.5968    
# Site:Storage:Scion 12   3012.3     251.0 3909   0.7135    
# Residuals          40  13690.5     342.3                  
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variance explained (%) by each term of the Simpson permutation ANOVA
df <- data.frame(unclass(summary(result)))
total_variance <- sum(df$R.Sum.Sq)
df$Perc.Var <- df$R.Sum.Sq / total_variance * 100
df
#                    Df    R.Sum.Sq   R.Mean.Sq Iter  Pr.Prob.    Perc.Var
# Site                2 20851.30423 10425.65212 5000 0.0000000 45.38763016
# Storage             1    30.96912    30.96912  163 0.3803681  0.06741138
# Site:Storage        2  1422.32005   711.16002 1690 0.1118343  3.09600472
# Scion               6  1995.82345   332.63724  609 0.3924466  4.34436598
# Site:Scion         12  3933.80635   327.81720 1536 0.4915365  8.56282877
# Storage:Scion       6  1003.51689   167.25281  677 0.5967504  2.18438391
# Site:Storage:Scion 12  3012.25990   251.02166 3909 0.7134817  6.55687225
# Residuals          40 13690.50000   342.26250   NA        NA 29.80050282

Beta diversity PCA/NMDS

PCA

# Number of PCs to include
n_pcs <- 20

# perform PC decomposition of DES object
mypca <- des_to_pca(dds)

# to get pca plot axis into the same scale create a dataframe of PC scores multiplied by their variance
bac_pca <- t(data.frame(t(mypca$x) * mypca$percentVar))

# Full factorial design reused by the per-PC ANOVAs below
formula = FULL_DESIGN

Percent variation in first 20 PCs

# Cumulative percentage of variance explained
# FIX(review): seq_along() instead of 1:length() (safe for empty input)
pca_cum_var <- data.frame(
  cumulative = cumsum(mypca$percentVar * 100),
  no = seq_along(mypca$percentVar)
)

# Plot cumulative percentage of variance explained
bac_cum_pca <- ggline(
  pca_cum_var, x = "no", y = "cumulative", plot_type = "l",
  xlab = "Number of PCs", ylab = "Cumulative % variance explained",
  title = "Bacteria: cumulative % variance explained by PCs"
)
# FIX(review): dropped the trailing comma in ggsave(), which passed an empty
# (missing) argument and only worked by accident of argument matching.
ggsave(filename = "bac_cum_pca.png", plot = bac_cum_pca, path = "figures/")
bac_cum_pca

# Table of % variance explained by each of the first n_pcs PCs
pca_var <- data.frame(
  row.names = paste0("PC", seq_len(n_pcs)),
  perc_var = round(mypca$percentVar[seq_len(n_pcs)] * 100, 1)
)

pca_var
#      perc_var
# PC1      18.6
# PC2      12.7
# PC3       7.1
# PC4       4.0
# PC5       3.0
# PC6       2.1
# PC7       1.8
# PC8       1.6
# PC9       1.3
# PC10      1.3
# PC11      1.3
# PC12      1.2
# PC13      1.2
# PC14      1.1
# PC15      1.1
# PC16      1.1
# PC17      1.0
# PC18      1.0
# PC19      1.0
# PC20      1.0

ANOVA of first 20 PCs

# ANOVA of the full factorial design on each of the first n_pcs PC scores;
# returns a named list of summary.aov tables (one per PC)
pca_summary <- apply(
  mypca$x[, 1:n_pcs], 2, 
  function(x){
    summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))
  }
)

pca_summary
# $PC1
#                    Df Sum Sq Mean Sq F value Pr(>F)    
# Site                2 190646   95323 224.072 <2e-16 ***
# Storage             1     25      25   0.058 0.8106    
# Scion               6   4227     704   1.656 0.1572    
# Site:Storage        2   2566    1283   3.015 0.0603 .  
# Site:Scion         12   8401     700   1.646 0.1176    
# Storage:Scion       6   1527     255   0.598 0.7298    
# Site:Storage:Scion 12   5124     427   1.004 0.4634    
# Residuals          40  17016     425                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC2
#                    Df Sum Sq Mean Sq F value Pr(>F)    
# Site                2 137668   68834 276.644 <2e-16 ***
# Storage             1    142     142   0.572  0.454    
# Scion               6   1474     246   0.987  0.447    
# Site:Storage        2    604     302   1.213  0.308    
# Site:Scion         12   4425     369   1.482  0.172    
# Storage:Scion       6    711     119   0.476  0.822    
# Site:Storage:Scion 12   2035     170   0.681  0.759    
# Residuals          40   9953     249                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC3
#                    Df Sum Sq Mean Sq F value   Pr(>F)    
# Site                2   2673    1337   1.562 0.222149    
# Storage             1     30      30   0.035 0.851920    
# Scion               6  13665    2278   2.663 0.028714 *  
# Site:Storage        2  14748    7374   8.621 0.000771 ***
# Site:Scion         12  10740     895   1.046 0.428331    
# Storage:Scion       6   8140    1357   1.586 0.176467    
# Site:Storage:Scion 12   2870     239   0.280 0.989466    
# Residuals          40  34217     855                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC4
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2   5281  2640.3   5.834 0.00598 **
# Storage             1    635   634.6   1.402 0.24336   
# Scion               6   3161   526.9   1.164 0.34456   
# Site:Storage        2    630   315.1   0.696 0.50442   
# Site:Scion         12  13692  1141.0   2.521 0.01418 * 
# Storage:Scion       6   1827   304.5   0.673 0.67221   
# Site:Storage:Scion 12   6439   536.6   1.186 0.32589   
# Residuals          40  18104   452.6                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC5
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2      5       2   0.006 0.99447   
# Storage             1   4033    4033   9.433 0.00382 **
# Scion               6   1458     243   0.568 0.75299   
# Site:Storage        2   5967    2984   6.979 0.00251 **
# Site:Scion         12   2582     215   0.503 0.90010   
# Storage:Scion       6   1984     331   0.773 0.59554   
# Site:Storage:Scion 12   3480     290   0.678 0.76149   
# Residuals          40  17102     428                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC6
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     32    16.2   0.055 0.9467  
# Storage             1   1498  1497.8   5.070 0.0299 *
# Scion               6   4754   792.3   2.682 0.0278 *
# Site:Storage        2    811   405.3   1.372 0.2653  
# Site:Scion         12   2040   170.0   0.575 0.8486  
# Storage:Scion       6   1724   287.4   0.973 0.4560  
# Site:Storage:Scion 12   3166   263.8   0.893 0.5611  
# Residuals          40  11817   295.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC7
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     85    42.7   0.157 0.8556  
# Storage             1    448   448.2   1.645 0.2070  
# Scion               6   2285   380.9   1.398 0.2393  
# Site:Storage        2   2000   999.9   3.670 0.0344 *
# Site:Scion         12   2462   205.1   0.753 0.6925  
# Storage:Scion       6    983   163.9   0.602 0.7273  
# Site:Storage:Scion 12   3081   256.7   0.942 0.5166  
# Residuals          40  10898   272.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC8
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2    110    55.1   0.197  0.822
# Storage             1    549   548.5   1.963  0.169
# Scion               6    750   125.1   0.448  0.842
# Site:Storage        2    334   167.1   0.598  0.555
# Site:Scion         12   1153    96.1   0.344  0.975
# Storage:Scion       6   1856   309.4   1.108  0.375
# Site:Storage:Scion 12   3233   269.4   0.964  0.497
# Residuals          40  11175   279.4               
# 
# $PC9
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2    291   145.4   0.624 0.54101   
# Storage             1    120   120.0   0.515 0.47731   
# Scion               6    658   109.6   0.470 0.82619   
# Site:Storage        2   3474  1737.2   7.452 0.00177 **
# Site:Scion         12   1215   101.2   0.434 0.93963   
# Storage:Scion       6    606   101.0   0.433 0.85225   
# Site:Storage:Scion 12    970    80.9   0.347 0.97417   
# Residuals          40   9324   233.1                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC10
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2    131    65.4   0.396 0.67572   
# Storage             1    365   365.5   2.213 0.14466   
# Scion               6   1060   176.7   1.070 0.39635   
# Site:Storage        2   2202  1100.8   6.666 0.00317 **
# Site:Scion         12   2043   170.2   1.031 0.44086   
# Storage:Scion       6   1947   324.4   1.965 0.09381 . 
# Site:Storage:Scion 12   1980   165.0   0.999 0.46711   
# Residuals          40   6605   165.1                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC11
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2     15    7.62   0.030  0.970
# Storage             1     84   84.49   0.332  0.568
# Scion               6   1487  247.76   0.974  0.455
# Site:Storage        2    117   58.45   0.230  0.796
# Site:Scion         12   1745  145.45   0.572  0.851
# Storage:Scion       6    838  139.64   0.549  0.768
# Site:Storage:Scion 12   1543  128.57   0.506  0.899
# Residuals          40  10172  254.30               
# 
# $PC12
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2     28   13.98   0.075  0.928
# Storage             1     58   57.94   0.310  0.581
# Scion               6   1149  191.52   1.025  0.423
# Site:Storage        2    474  237.00   1.268  0.292
# Site:Scion         12   1493  124.43   0.666  0.773
# Storage:Scion       6   1331  221.83   1.187  0.333
# Site:Storage:Scion 12   2905  242.10   1.296  0.259
# Residuals          40   7474  186.84               
# 
# $PC13
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     81    40.5   0.255 0.7763  
# Storage             1    309   308.7   1.945 0.1708  
# Scion               6    635   105.8   0.666 0.6771  
# Site:Storage        2    794   397.2   2.502 0.0947 .
# Site:Scion         12   4123   343.6   2.164 0.0338 *
# Storage:Scion       6    828   138.0   0.869 0.5258  
# Site:Storage:Scion 12   1113    92.8   0.584 0.8416  
# Residuals          40   6350   158.8                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC14
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     90    44.9   0.296 0.7457  
# Storage             1     13    13.1   0.086 0.7707  
# Scion               6    536    89.3   0.587 0.7382  
# Site:Storage        2   1555   777.6   5.113 0.0105 *
# Site:Scion         12   1200   100.0   0.657 0.7802  
# Storage:Scion       6   1495   249.2   1.638 0.1619  
# Site:Storage:Scion 12   2975   247.9   1.630 0.1219  
# Residuals          40   6084   152.1                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC15
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2     36   18.17   0.098  0.907
# Storage             1      9    8.77   0.047  0.829
# Scion               6   1477  246.09   1.330  0.267
# Site:Storage        2    469  234.74   1.269  0.292
# Site:Scion         12   1713  142.75   0.771  0.675
# Storage:Scion       6    753  125.46   0.678  0.668
# Site:Storage:Scion 12   1654  137.80   0.745  0.700
# Residuals          40   7402  185.04               
# 
# $PC16
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     28    14.1   0.079 0.9242  
# Storage             1   1096  1095.9   6.138 0.0175 *
# Scion               6    475    79.2   0.444 0.8451  
# Site:Storage        2    815   407.7   2.284 0.1150  
# Site:Scion         12    534    44.5   0.249 0.9936  
# Storage:Scion       6    377    62.9   0.352 0.9045  
# Site:Storage:Scion 12   2909   242.4   1.358 0.2263  
# Residuals          40   7141   178.5                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC17
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2      1    0.29   0.002  0.998
# Storage             1     38   37.87   0.231  0.634
# Scion               6    683  113.87   0.694  0.656
# Site:Storage        2    315  157.64   0.961  0.391
# Site:Scion         12   1540  128.30   0.782  0.665
# Storage:Scion       6   1359  226.52   1.380  0.246
# Site:Storage:Scion 12   2442  203.46   1.240  0.291
# Residuals          40   6564  164.09               
# 
# $PC18
#                    Df Sum Sq Mean Sq F value Pr(>F)  
# Site                2     72    36.1   0.277 0.7599  
# Storage             1    168   168.2   1.290 0.2628  
# Scion               6    851   141.8   1.087 0.3868  
# Site:Storage        2    271   135.6   1.040 0.3629  
# Site:Scion         12   2714   226.2   1.734 0.0955 .
# Storage:Scion       6   1954   325.6   2.496 0.0381 *
# Site:Storage:Scion 12   1523   126.9   0.973 0.4897  
# Residuals          40   5217   130.4                 
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# $PC19
#                    Df Sum Sq Mean Sq F value Pr(>F)
# Site                2     61   30.31   0.191  0.827
# Storage             1     16   16.12   0.101  0.752
# Scion               6    977  162.91   1.026  0.423
# Site:Storage        2    170   84.95   0.535  0.590
# Site:Scion         12   2170  180.83   1.138  0.358
# Storage:Scion       6    799  133.10   0.838  0.548
# Site:Storage:Scion 12   1504  125.30   0.789  0.659
# Residuals          40   6354  158.85               
# 
# $PC20
#                    Df Sum Sq Mean Sq F value  Pr(>F)   
# Site                2     16     8.0   0.075 0.92807   
# Storage             1    150   150.0   1.397 0.24423   
# Scion               6   2350   391.6   3.647 0.00557 **
# Site:Storage        2    155    77.5   0.721 0.49224   
# Site:Scion         12   1810   150.8   1.405 0.20414   
# Storage:Scion       6    976   162.6   1.515 0.19831   
# Site:Storage:Scion 12   2139   178.2   1.660 0.11382   
# Residuals          40   4296   107.4                   
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Percent variation in first 20 PCs for each factor

# Extract PC scores as a list of dataframes
pcas <- lapply(pca_summary, function(i) data.frame(unclass(i)))

# Merge into single dataframe
# Per term and PC: % variance of that PC, PC number (name minus "PC" prefix),
# and whitespace-stripped factor name
pcs_factors_tidy <- lapply(
  names(pcas),
  function(name) mutate(
    pcas[[name]], 
    var = Sum.Sq / sum(pcas[[name]]$Sum.Sq) * 100,
    PC = substring(name, 3),
    Factor = gsub(" ", "", rownames(pcas[[name]]))
  )
) %>% bind_rows() %>% data.table()

# Adjust p values for FDR (Benjamini-Hochberg across all PCs and terms)
pcs_factors_tidy$p.adj <- p.adjust(pcs_factors_tidy$`Pr..F.`, method = "BH")

# Significant factors
kable(pcs_factors_tidy[p.adj < 0.05, c("PC", "Factor", "Df", "F.value", "p.adj", "var")])
PC Factor Df F.value p.adj var
1 Site 2 224.072105 0.00000 83.05869
2 Site 2 276.643717 0.00000 87.68034
3 Site:Storage 2 8.620505 0.03598 16.93559

PCA plot

# PCA ordination of PC1 vs PC2, coloured by site, shaped by storage
# FIX(review): removed the trailing comma after `alpha = 0.75` — it passed an
# empty (missing) argument to plotOrd.
bac_pca_plot <- plotOrd(
  bac_pca,
  colData(dds),
  design = "Site",
  shape = "Storage",
  axes = c(1, 2),
  # facet = "Storage", 
  cbPalette = T,
  alpha = 0.75
) #+ facet_wrap(~facet)

ggsave(filename = "bac_pca_plot.png", plot = bac_pca_plot, path = "figures/")

bac_pca_plot

# PCA ordination of PC2 vs PC3, coloured by scion, shaped by site
# FIX(review): removed the trailing comma after `alpha = 0.75` — it passed an
# empty (missing) argument to plotOrd.
bac_pca_2_3_plot <- plotOrd(
  bac_pca,
  colData(dds),
  design = "Scion",
  shape = "Site",
  axes = c(2, 3), 
  cbPalette = T,
  alpha = 0.75
)

ggsave(filename = "bac_pca_2_3_plot.png", plot = bac_pca_2_3_plot, path = "figures/")

PCA sum of squares (% var)

# Per-PC ANOVA sums of squares, weighted by each PC's % variance, giving the
# share of total community variance attributable to each model term.
# FIX(review): wrap cbind(x, colData(dds)) in as.data.frame() for consistency
# with the pca_summary chunk above — colData(dds) is an S4 DataFrame and
# aov() expects a plain data frame.
sum_squares <- apply(mypca$x, 2 ,function(x) 
  summary(aov(update(formula, x ~ .), data = as.data.frame(cbind(x, colData(dds)))))[[1]][2]
)
sum_squares <- do.call(cbind, sum_squares)
# Proportion of each PC's variance per term, weighted by the PC's variance
x <- t(apply(sum_squares, 2, prop.table))
perVar <- x * mypca$percentVar
#colSums(perVar)
round(colSums(perVar) / sum(colSums(perVar)) * 100, 3)
# Site               Storage            Scion              Site:Storage       
#             27.377              1.163              6.201              3.608 
# Site:Scion         Storage:Scion      Site:Storage:Scion Residuals          
#             11.077              5.262              9.804             35.509

ADONIS

# Calculate Bray-Curtis distance matrix from normalised counts
vg <- vegdist(t(counts(dds, normalize = T)), method = "bray")

# Full factorial design with the distance matrix as response
formula <- update(FULL_DESIGN, vg ~ .)

# Fixed seed so the permutation p-values are reproducible
set.seed(sum(utf8ToInt("Hamish McLean")))
result <- adonis2(formula, colData(dds), permutations = 1000)
result
# Permutation test for adonis under reduced model
# Terms added sequentially (first to last)
# Permutation: free
# Number of permutations: 1000
# 
# adonis2(formula = formula, data = colData(dds), permutations = 1000)
#                    Df SumOfSqs      R2       F   Pr(>F)    
# Site                2   4.6757 0.31606 19.4091 0.000999 ***
# Storage             1   0.2240 0.01514  1.8596 0.056943 .  
# Scion               6   0.9812 0.06633  1.3577 0.062937 .  
# Site:Storage        2   0.7189 0.04859  2.9841 0.002997 ** 
# Site:Scion         12   1.4891 0.10066  1.0302 0.406593    
# Storage:Scion       6   0.6866 0.04641  0.9500 0.556444    
# Site:Storage:Scion 12   1.2000 0.08112  0.8302 0.891109    
# Residual           40   4.8180 0.32569                     
# Total              81  14.7934 1.00000                     
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Add percentage of total sum of squares explained by each term
df <- result %>% data.frame()
df$Perc.Var <- df$SumOfSqs / df["Total", "SumOfSqs"] * 100
df
#                    Df   SumOfSqs         R2          F      Pr..F.   Perc.Var
# Site                2  4.6756525 0.31606401 19.4090620 0.000999001  31.606401
# Storage             1  0.2239849 0.01514090  1.8595639 0.056943057   1.514090
# Scion               6  0.9812079 0.06632754  1.3576946 0.062937063   6.632754
# Site:Storage        2  0.7188806 0.04859478  2.9841392 0.002997003   4.859478
# Site:Scion         12  1.4890925 0.10065944  1.0302266 0.406593407  10.065944
# Storage:Scion       6  0.6865717 0.04641076  0.9500073 0.556443556   4.641076
# Site:Storage:Scion 12  1.1999721 0.08111552  0.8301990 0.891108891   8.111552
# Residual           40  4.8180097 0.32568705         NA          NA  32.568705
# Total              81 14.7933719 1.00000000         NA          NA 100.000000

NMDS ordination

# NMDS ordination of the Bray-Curtis distances; fixed seed for reproducibility
set.seed(sum(utf8ToInt("Hamish McLean")))
ord <- metaMDS(vg,trace=0) 
#sratmax=20000,maxit=20000,try = 177, trymax = 177

bac_nmds <- scores(ord)

bac_nmds_plot <- plotOrd(
  bac_nmds, colData(dds), 
  design = "Site", 
  shape = "Storage", 
  alpha = 0.75, cbPalette = T
) #+ theme(text = element_text(size = 14))

# Fix: this is the bacterial NMDS plot. It was previously saved as
# "fun_nmds_plot.png", which silently overwrote the fungal NMDS figure.
ggsave(filename = "bac_nmds_plot.png", plot = bac_nmds_plot, path = "figures/")

bac_nmds_plot

ASV abundance

Explore distribution of ASV counts

# Extract normalised counts from DESeq object
asv_counts <- counts(dds, normalize = T) %>% as.data.frame()

# Sum ASV counts across samples
total_asv_counts <- rowSums(asv_counts)

# Sort ASVs by abundance
total_asv_counts <- total_asv_counts[order(total_asv_counts, decreasing = T)]

# Calculate cumulative percentage of reads contributed by the top-n ASVs;
# `no` is the abundance rank (1 = most abundant ASV)
cumulative <- data.frame(
  cumulative = cumsum(total_asv_counts) / sum(total_asv_counts) * 100,
  no = seq_along(total_asv_counts)
)

# Plot cumulative percentage of ASVs
bac_cum_asv <- ggline(
  data = cumulative, x = "no", y = "cumulative", 
  plot_type = "l", palette = cbPalette,
  title = "Cumulative percentage of bacterial ASVs", xlab = "Number of ASVs", 
  ylab = "Cumulative percentage of reads"
)
ggsave(filename = "bac_cum_asv.png", plot = bac_cum_asv, path = "figures/")
bac_cum_asv

# Find the number of ASVs that account for 50%, 80%, and 99% of total reads.
# Fix: compare against the cumulative-percentage COLUMN only. `cumulative` is
# a data frame, so `cumulative <= 50` tested both columns and also counted the
# rows where the rank column `no` <= threshold, inflating every figure (e.g.
# the frozen "205" below is 155 true ASVs + 50 spurious hits from `no <= 50`).
# The frozen outputs below predate this fix and are overestimates.
cat(
  "Number of ASVs that account for 50%, 80%, and 99% of total reads", "\n\n",
  "50%:", sum(cumulative$cumulative <= 50), "\n",
  "80%:", sum(cumulative$cumulative <= 80), "\n",
  "99%:", sum(cumulative$cumulative <= 99), "\n"
)
# Number of ASVs that account for 50%, 80%, and 99% of total reads 
# 
#  50%: 205 
#  80%: 1055 
#  99%: 5037
# Find the cumulative percentage accounted for by top x ASVs
cat(
  "Percentage of total reads accounted for by the top 100, 200,and 500 ASVs:", "\n\n",
  "100:", round(cumulative[cumulative$no == 100, "cumulative"], 1) , "\n",
  "200:", round(cumulative[cumulative$no == 200, "cumulative"], 1) , "\n",
  "500:", round(cumulative[cumulative$no == 500, "cumulative"], 1) , "\n"
)
# Percentage of total reads accounted for by the top 100, 200,and 500 ASVs: 
# 
#  100: 43.7 
#  200: 53.9 
#  500: 69.1
# Average ASV counts in order (mean normalised count per ASV, descending)
mean_asv_counts <- rowMeans(asv_counts)
mean_asv_counts <- mean_asv_counts[order(mean_asv_counts, decreasing = T)]

# Plot read count distribution
bac_asv_counts <- ggline(
  data = data.frame(ASV = seq_along(mean_asv_counts), counts = mean_asv_counts),
  x = "ASV", y = "counts", plot_type = "l",
  title = "Bacterial ASV read count distribution", xlab = "ASV", ylab = "Mean read count"
)
ggsave(filename = "bac_asv_counts.png", plot = bac_asv_counts, path = "figures/")
bac_asv_counts

# Number of ASVs with mean read count > 100, 200, and 500
# (used below to choose the abundance filter for the per-ASV models)
cat(
  "Number of ASVs with mean read count > 100, 200, and 500", "\n\n",
  "100:", sum(rowMeans(asv_counts) > 100), "\n",
  "200:", sum(rowMeans(asv_counts) > 200), "\n",
  "500:", sum(rowMeans(asv_counts) > 500), "\n"
)
# Number of ASVs with mean read count > 100, 200, and 500 
# 
#  100: 71 
#  200: 32 
#  500: 8

Filter top ASVs with mean read count > 100

# Filter the top x abundant OTUs by the sum of their normalised counts
# top_asvs <- asv_counts[order(rowSums(asv_counts), decreasing = T)[1:DIFFOTU], ]

# Filter ASVs with mean read count > 100
top_asvs <- asv_counts[rowMeans(asv_counts) > 100, ]

# Check that sample names match
# (names() of a data frame are its column names, i.e. the sample IDs)
identical(names(top_asvs), rownames(colData))
# [1] TRUE
# Extract taxonomic data for top OTUs
top_taxa <- taxData[rownames(top_asvs), ]

# Log transform normalised counts
# top_asvs <- log10(top_asvs + 1) # Log transform in models instead

# Transpose so samples are rows and ASVs are columns
top_asv_data <- data.frame(t(top_asvs))
top_asv_ids <- rownames(top_asvs)
identical(rownames(top_asv_data), rownames(colData))
# [1] TRUE
# merge(by = 0) joins on rownames; restore them afterwards
top_asv_data <- merge(top_asv_data, colData, by = 0) %>% column_to_rownames("Row.names")

Effect of design factors on top ASVs

Effect of Site, Scion, and Storage on abundance of top 200 ASVs

# ANOVA of top ASVs
# Fits aov(log(count + 1) ~ full factorial design) for a single ASV and
# returns a one-row data frame with percent variance explained and p-value
# for each main effect and interaction, plus the residual variance.
#   otu:     ASV id (a column name in `data`)
#   formula: design formula; its response is replaced with the log abundance
#   data:    data frame of per-sample ASV counts plus the design factors
# Note: summary.aov() pads term rownames with trailing whitespace; they are
# trimmed here so the lookups no longer depend on an exact number of trailing
# spaces (previously keys such as 'Site ' hard-coded the padding).
otu_lm_anova <- function(otu, formula, data) {
  f = update(formula, paste0("log(", otu, " + 1) ~ ."))
  a = aov(f, data = data) %>% summary() %>% unclass() %>% data.frame()
  rownames(a) <- trimws(rownames(a))
  total_var = sum(a$Sum.Sq)
  d = data.frame(
    OTU = otu,
    Taxonomy = taxData[otu, "rank"],  # taxData is loaded at file level
    Abundance = mean(data[[otu]]),
    site_var = a['Site', 'Sum.Sq'] / total_var * 100,
    site_p = a['Site', 'Pr..F.'],
    scion_var = a['Scion', 'Sum.Sq'] / total_var * 100,
    scion_p = a['Scion', 'Pr..F.'],
    storage_var = a['Storage', 'Sum.Sq'] / total_var * 100,
    storage_p = a['Storage', 'Pr..F.'],
    site_scion_var = a['Site:Scion', 'Sum.Sq'] / total_var * 100,
    site_scion_p = a['Site:Scion', 'Pr..F.'],
    site_storage_var = a['Site:Storage', 'Sum.Sq'] / total_var * 100,
    site_storage_p = a['Site:Storage', 'Pr..F.'],
    storage_scion_var = a['Storage:Scion', 'Sum.Sq'] / total_var * 100,
    storage_scion_p = a['Storage:Scion', 'Pr..F.'],
    residuals_var = a['Residuals', 'Sum.Sq'] / total_var * 100
  )
  return(d)
}

# Negative binomial regression model
# Fits a negative binomial GLM for a single ASV against the design and
# summarises, for each main effect, the percentage of total deviance
# explained and the chi-squared p-value.
#   otu:     ASV id (a column name in `data`)
#   formula: design formula; its response is replaced with the raw count
#   data:    data frame of per-sample ASV counts plus the design factors
otu_negbin_anova <- function(otu, formula, data) {
  model_formula <- update(formula, paste0(otu, " ~ ."))
  fit <- glm.nb(model_formula, data = data)
  anova_tab <- data.frame(anova(fit, test = "Chisq"))
  # Total deviance = explained deviance of all terms + residual deviance
  deviance_total <- sum(anova_tab$Deviance, na.rm = T) + tail(anova_tab$Resid..Dev, 1)
  data.frame(
    OTU = otu,
    Taxonomy = taxData[otu, "rank"],  # taxData is loaded at file level
    site_var = anova_tab['Site', 'Deviance'] / deviance_total * 100,
    site_p = anova_tab['Site', 'Pr..Chi.'],
    scion_var = anova_tab['Scion', 'Deviance'] / deviance_total * 100,
    scion_p = anova_tab['Scion', 'Pr..Chi.'],
    storage_var = anova_tab['Storage', 'Deviance'] / deviance_total * 100,
    storage_p = anova_tab['Storage', 'Pr..Chi.']
  )
}

formula <- FULL_DESIGN

# Run the per-ASV ANOVA for every retained ASV; sapply returns a matrix of
# one-row results which is transposed and coerced to a data.table.
otu_anova_results <- sapply(
  top_asv_ids, function(x) otu_lm_anova(x, formula, top_asv_data)
) %>% t() %>% data.table()

# BH-adjust the main-effect p-values across ASVs.
# NOTE(review): only the three main effects are adjusted here; the interaction
# p-values reported below (site_scion_p etc.) remain unadjusted — confirm this
# is intentional.
otu_anova_results_adjusted <- otu_anova_results %>% 
  mutate(
    site_p = p.adjust(site_p, method = "BH"),
    scion_p = p.adjust(scion_p, method = "BH"),
    storage_p = p.adjust(storage_p, method = "BH")
  )

cat(
  "Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values", "\n\n",
  "Site:", nrow(otu_anova_results_adjusted[site_p < 0.05, ]), "\n",
  "Scion:", nrow(otu_anova_results_adjusted[scion_p < 0.05, ]), "\n",
  "Storage:", nrow(otu_anova_results_adjusted[storage_p < 0.05, ]), "\n",
  "Site:Scion:", nrow(otu_anova_results_adjusted[site_scion_p < 0.05, ]), "\n",
  "Site:Storage:", nrow(otu_anova_results_adjusted[site_storage_p < 0.05, ]), "\n",
  "Storage:Scion:", nrow(otu_anova_results_adjusted[storage_scion_p < 0.05, ]), "\n\n"
)
# Number of ASVs with statistically significant (*P* < 0.05) adjusted p-values 
# 
#  Site: 68 
#  Scion: 6 
#  Storage: 0 
#  Site:Scion: 3 
#  Site:Storage: 50 
#  Storage:Scion: 2
# ASVs with a significant Scion effect
otu_anova_results_adjusted[scion_p < 0.05, ]
#      OTU            Taxonomy Abundance site_var       site_p scion_var
# 1: OTU11     Actinoplanes(g)  362.5498 57.01493 1.077636e-13  8.779256
# 2: OTU17 Mycobacteriaceae(f)  336.7446 63.94552 2.523714e-16  8.075128
# 3: OTU28    Amycolatopsis(g)   305.052 57.08183 4.611778e-13  11.85351
# 4:  OTU7   Bradyrhizobium(g)  625.7187 13.22873 9.450589e-04  18.27876
# 5: OTU70        Kribbella(g)  102.5546 37.87329 4.859133e-10  11.17966
# 6: OTU83   Steroidobacter(g)  127.8547 1.078752 5.409558e-01  25.67554
#       scion_p storage_var storage_p site_scion_var site_scion_p
# 1: 0.04194029   0.8337958 0.9505991       3.047229    0.7409656
# 2: 0.02035916    0.150778 0.9505991       4.634767    0.1890384
# 3: 0.02035916    0.123614 0.9505991       3.306424    0.7658573
# 4: 0.04194029   0.5012272 0.9505991       11.13561    0.3192109
# 5: 0.04194029   0.2103298 0.9505991       7.135868    0.2595534
# 6: 0.02035916  0.01326388 0.9539102       13.74426    0.2430695
#    site_storage_var site_storage_p storage_scion_var storage_scion_p
# 1:         10.65662   1.630064e-05          2.795821       0.2855695
# 2:         5.824716   0.0001719382          2.541607       0.1787341
# 3:         3.976902      0.0128961          4.084545       0.1551837
# 4:           14.725    0.000424495          8.413546       0.1223116
# 5:         17.38269   1.649034e-06          4.745871       0.1409803
# 6:         9.945972    0.006369712          9.190067       0.1297052
#    residuals_var
# 1:      14.49169
# 2:      10.73637
# 3:      16.36471
# 4:      31.04119
# 5:      18.37531
# 6:      34.57801
# ASVs with a significant (unadjusted) Site:Scion interaction
otu_anova_results_adjusted[site_scion_p < 0.05, ]
#      OTU              Taxonomy Abundance site_var       site_p scion_var
# 1: OTU27 Micromonosporaceae(f)  175.7534 43.52383 1.608145e-11  5.694322
# 2: OTU30       Streptomyces(g)  194.7991 51.85274 2.149921e-11  5.906322
# 3:  OTU5       Streptomyces(g)  1182.674 43.70563 7.650936e-11  3.363344
#      scion_p storage_var storage_p site_scion_var site_scion_p site_storage_var
# 1: 0.1315376   0.3820512 0.9505991       11.76138   0.01829337         10.92462
# 2: 0.1678927    2.254467 0.6911820       11.93521   0.04910443         4.422486
# 3: 0.4265673   0.1340044 0.9505991        13.1956   0.01973321         13.96242
#    site_storage_p storage_scion_var storage_scion_p residuals_var
# 1:   3.376895e-05          2.136838       0.5197268      16.22546
# 2:     0.01768693          2.226376       0.6128633      19.78407
# 3:   1.270369e-05          2.664643        0.462205      18.44128
# ASVs with a significant (unadjusted) Storage:Scion interaction
otu_anova_results_adjusted[storage_scion_p < 0.05, ]
#      OTU         Taxonomy Abundance site_var       site_p scion_var    scion_p
# 1: OTU38 Mycobacterium(g)  159.5277 37.98721 1.099712e-08  11.27463 0.05950441
# 2:  OTU8  Actinoplanes(g)  604.0953 62.80041 4.193815e-17  4.119011 0.06706237
#    storage_var storage_p site_scion_var site_scion_p site_storage_var
# 1:  0.04875582 0.9505991       7.425487    0.4346678          2.49345
# 2:   0.8239168 0.8162242       4.148908    0.1593185         11.20464
#    site_storage_p storage_scion_var storage_scion_p residuals_var
# 1:      0.1367133          9.065908      0.03560492      23.83539
# 2:   1.110335e-07          6.314704     0.001203779      9.131464

Canker counts

Testing the effects of ASV abundance, α-diversity, and β-diversity on canker counts.

This uses a nested negative binomial regression model.

The base model for canker counts uses the formula: Cankers ~ Site * Storage * Scion.

# Filter out samples with missing canker count
canker_abundance_data <- colData[complete.cases(colData$Cankers), ]

# Base model
canker_design = "Cankers ~ Site * Storage * Scion"
base_model <- glm.nb(canker_design, data = canker_abundance_data)

# Abundance model: base model plus total microbial abundance (log copy number)
abundance_design = paste(canker_design, "+ log(copy_number)")
abundance_model <- glm.nb(abundance_design, data = canker_abundance_data)

# ANOVA of abundance with canker count
# Likelihood ratio test: does total abundance improve the base model?
kable(anova(base_model, abundance_model))
Model theta Resid. df 2 x log-lik. Test df LR stat. Pr(Chi)
Site * Storage * Scion 2.882191 33 -554.6118 NA NA NA
Site * Storage * Scion + log(copy_number) 2.980708 32 -551.9342 1 vs 2 1 2.677645 0.1017661

Effect of ASV abundance on canker count

# Filter out samples with missing canker count
top_asv_data <- top_asv_data[complete.cases(top_asv_data$Cankers), ]

# Base model design
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with ASV abundance data
# (refit on the ASV data so it is nested within each per-ASV model below)
base_model <- glm.nb(canker_design, data = top_asv_data)

# ANOVA of top ASVs with canker count
# Likelihood ratio test of the canker model with vs without the log abundance
# of a single ASV as an extra covariate. Returns a one-row data frame with
# the ASV id, taxonomy, covariate coefficient, percent deviance explained by
# the covariate, and the LR-test p-value.
asv_canker_anova <- function(otu, design, base_model, data) {
  abundance_term <- paste0("log(", otu, " + 1)")
  full_formula <- paste(design, "+", abundance_term)#, "+", abundance_term, ":Site")
  fit <- glm.nb(full_formula, data = data)
  lr_test <- data.frame(anova(base_model, fit))
  term_anova <- data.frame(suppressWarnings(anova(fit)))
  # Total deviance = explained deviance of all terms + residual deviance
  deviance_total <- sum(term_anova$Deviance, na.rm = T) + tail(term_anova$Resid..Dev, 1)
  data.frame(
    OTU = otu,
    Taxonomy = taxData[otu, "rank"],
    coef = fit$coefficients[abundance_term],
    var = term_anova[abundance_term, 'Deviance'] / deviance_total * 100,
    p = lr_test[2, 'Pr.Chi.']
  )
}

# kable(asv_canker_anova(top_asv_ids[1], canker_design, base_model, canker_top_asv_data))

# Effect of ASV abundance on canker count for top ASVs
# (one LR test per ASV against the shared base model)
asv_canker_results <- sapply(
  top_asv_ids, 
  function(x) asv_canker_anova(x, canker_design, base_model, top_asv_data)
) %>% t() %>% data.table()

# Adjust p-values for multiple testing
asv_canker_results$p_adjusted <- p.adjust(asv_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(asv_canker_results[p_adjusted < 0.05, ]), 
  "ASVs have statistically significant (*P* < 0.05) adjusted p-values"
)
# 1 ASVs have statistically significant (*P* < 0.05) adjusted p-values
asv_canker_results[p_adjusted < 0.05, ]
#      OTU           Taxonomy       coef      var            p p_adjusted
# 1: OTU40 Mycobacteriales(o) -0.5353163 1.070081 0.0004620826 0.03280786

Effect of aggregated genera on canker counts

# Add genus from taxData to countData
# NOTE(review): the lookup uses rownames(countData) while mutating a data
# frame built from counts(dds) — assumes both share identical row order;
# confirm upstream.
bac_genus_data <- counts(dds, normalize = T) %>% as.data.frame() %>% mutate(
  genus = taxData[rownames(countData), "genus"]
)

# Group by genus and set genus to rownames
bac_genus_data <- bac_genus_data %>% group_by(genus) %>% summarise_all(sum) %>% as.data.frame()

# Set rownames as genus
rownames(bac_genus_data) <- bac_genus_data$genus
bac_genus_data <- dplyr::select(bac_genus_data, -genus)

# Keep only genera with mean normalised abundance > 10 across samples
bac_genus_data <- bac_genus_data[rowMeans(bac_genus_data) > 10, ]

# Rank not genus: names like "Family(f)" are higher-rank fallbacks, not genera
not_genus <- rownames(bac_genus_data)[grep("\\([a-z]\\)", rownames(bac_genus_data))]
# Remove rows with genus in not_genus
bac_genus_data <- bac_genus_data[!rownames(bac_genus_data) %in% not_genus, ]
cat(
  length(not_genus), "non-genus ranks removed:\n\n",
  not_genus, "\n"
)
# 0 non-genus ranks removed:
# 
# 
# Remove any with slashes or underscores
# (such names would break the model formulas built from them below)
removals <- rownames(bac_genus_data)[grep("[/_]", rownames(bac_genus_data))]
# Remove rows with symbols in the name
bac_genus_data <- bac_genus_data[!rownames(bac_genus_data) %in% removals, ]
cat(
  length(removals), "ranks with slashes or underscores removed:\n\n",
  removals, "\n"
)
# 4 ranks with slashes or underscores removed:
# 
#  Armatimonas/Armatimonadetes_gp1 Candidatus_Solibacter Saccharibacteria_genera_incertae_sedis Subdivision3_genera_incertae_sedis
# Keep the vector of retained genus names before transposing
bac_genera <- rownames(bac_genus_data)

# Transpose and add metadata from colData
bac_genus_data <- t(bac_genus_data) %>% as.data.frame() %>% mutate(
  Site = colData[rownames(.), "Site"],
  Storage = colData[rownames(.), "Storage"],
  Scion = colData[rownames(.), "Scion"],
  Cankers = colData[rownames(.), "Cankers"]
)

# Filter out samples with missing canker count
bac_genus_data <- bac_genus_data[complete.cases(bac_genus_data$Cankers), ]
# Base model design
canker_design = "Cankers ~ Site * Storage * Scion"

# Base model with genus abundance data
base_model <- glm.nb(canker_design, data = bac_genus_data)

# ANOVA of genus abundance with canker count
# Likelihood ratio test of the canker model with vs without the log abundance
# of a single genus as an extra covariate. Returns a one-row data.table with
# the genus, its mean abundance, covariate coefficient, percent deviance
# explained by the covariate, and the LR-test p-value.
genus_canker_anova <- function(genus, design, base_model, data) {
  abundance_term <- paste0("log(", genus, " + 1)")
  full_formula <- paste(design, "+", abundance_term)#, "+", abundance_term, ":Site")
  fit <- glm.nb(full_formula, data = data)
  lr_test <- data.frame(anova(base_model, fit))
  term_anova <- data.frame(suppressWarnings(anova(fit)))
  # Total deviance = explained deviance of all terms + residual deviance
  deviance_total <- sum(term_anova$Deviance, na.rm = T) + tail(term_anova$Resid..Dev, 1)
  data.table(
    Genus = genus,
    Abundance = mean(data[[genus]]),
    coef = fit$coefficients[abundance_term],
    var = term_anova[abundance_term, 'Deviance'] / deviance_total * 100,
    p = lr_test[2, 'Pr.Chi.']
  )
}

# genus_canker_anova(bac_genera[1], canker_design, base_model, bac_genus_data)

# Effect of genera abundance on canker counts
# (one LR test per genus against the shared base model)
genus_canker_results <- sapply(
  bac_genera, 
  function(x) genus_canker_anova(x, canker_design, base_model, bac_genus_data)
) %>% t() %>% data.table()

# Adjust p-values for multiple testing
genus_canker_results$p_adjusted <- p.adjust(genus_canker_results$p, method = "BH")

# Summary of ASVs with statistically significant (*P* < 0.05) adjusted p-values
cat(
  nrow(genus_canker_results[p_adjusted < 0.05, ]), "of", nrow(genus_canker_results),
  "genera have statistically significant (*P* < 0.05) adjusted p-values\n\n"
)
# 2 of 238 genera have statistically significant (*P* < 0.05) adjusted p-values
if(nrow(genus_canker_results[p_adjusted < 0.05, ]) > 0) {
  genus_canker_results[p_adjusted < 0.05, ]
}
#               Genus Abundance       coef      var            p   p_adjusted
# 1: Marisediminicola  11.01665 -0.7908458 1.442856 1.264145e-05 1.504332e-03
# 2:    Pedomicrobium  22.65009  0.7736918 1.317756 3.761193e-07 8.951639e-05
sig_genus <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

# Refit each significant genus model to print the full LR-test tables
for (genus in sig_genus) {
  log_genus = paste0("log(", genus, " + 1)")
  f = paste(canker_design, "+", log_genus)
  m = glm.nb(f, data = bac_genus_data)
  print(anova(base_model, m))
}
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                                                Model    theta Resid. df
# 1                             Site * Storage * Scion 2.882191        33
# 2 Site * Storage * Scion + log(Marisediminicola + 1) 3.878220        32
#      2 x log-lik.   Test    df LR stat.      Pr(Chi)
# 1       -554.6118                                   
# 2       -535.5479 1 vs 2     1 19.06389 1.264145e-05
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                                             Model    theta Resid. df
# 1                          Site * Storage * Scion 2.882191        33
# 2 Site * Storage * Scion + log(Pedomicrobium + 1) 4.520792        32
#      2 x log-lik.   Test    df LR stat.      Pr(Chi)
# 1       -554.6118                                   
# 2       -528.7987 1 vs 2     1 25.81314 3.761193e-07
Plot of genus abundance against canker count
# Significant genera (same computation as `sig_genus` above)
significant_genera <- genus_canker_results[p_adjusted < 0.05]$Genus %>% unlist()

significant_genera_data <- bac_genus_data[, c(significant_genera, FACTORS, "Cankers")]

# Melt data for ggplot (one row per sample x genus)
significant_genera_data <- significant_genera_data %>% reshape2::melt(
  id.vars = c(FACTORS, "Cankers"), variable.name = "Genus", value.name = "Abundance"
)

# Log transform abundance
significant_genera_data$log10_abundance <- log10(significant_genera_data$Abundance + 1)

# Plot of genus abundance against canker count
# NOTE(review): free_x/free_y are not documented ggscatter arguments — they
# are forwarded via `...`; the free scales come from the facet_wrap below.
bac_genus_canker_plot <- ggscatter(
  data = significant_genera_data, x = "log10_abundance", y = "Cankers", 
  color = "Site", shape = "Storage",
  xlab = "Genus abundance (log10)", ylab = "Canker count",
  free_x = T, free_y = T, palette = cbPalette
) + facet_wrap(~ Genus, scales = "free")

ggsave(
  filename = "bac_genus_canker_plot.png", plot = bac_genus_canker_plot, path = "figures/",
  height = 10, width = 20, units = "cm"
)

Effect of α-diversity on canker count

# ANOVA of α-diversity with canker count

# Base model with α-diversity data
base_model <- glm.nb(canker_design, data = all_alpha_ord)

# α-diversity measures to test (columns of all_alpha_ord)
measures <- c("S.chao1", "shannon", "simpson")

# ANOVA of α-diversity with canker count
# Likelihood ratio test of the canker model with vs without a single
# α-diversity measure as an extra covariate. Returns a one-row data frame
# with the measure, its coefficient, percent deviance explained, and the
# LR-test p-value.
# For consistency with asv_canker_anova()/genus_canker_anova(), the design
# string and null model can now be passed explicitly; the defaults preserve
# the original behaviour of reading `canker_design` and `base_model` from
# the enclosing environment, so existing call sites are unchanged.
alpha_canker_anova <- function(measure, data, design = canker_design,
                               null_model = base_model) {
  f = paste(design, "+", measure)
  m = glm.nb(f, data = data)
  a = data.frame(anova(null_model, m))  # likelihood ratio test vs null model
  b = data.frame(anova(m))              # sequential deviance table
  # Total deviance = explained deviance of all terms + residual deviance
  total_deviance = sum(b$Deviance, na.rm = T) + tail(b$Resid..Dev, 1)
  d = data.frame(
    measure = measure,
    coef = m$coefficients[measure],
    var = b[measure, 'Deviance'] / total_deviance * 100,
    p = a[2, 'Pr.Chi.']
  )
  return(d)
}

# Effect of α-diversity on canker count for each measure
alpha_canker_results <- data.table(t(sapply(measures, function(x) alpha_canker_anova(x, all_alpha_ord))))

alpha_canker_results
#    measure         coef       var          p
# 1: S.chao1 0.0003260492 0.1949192  0.3631458
# 2: shannon     1.047428  3.043469 0.03415997
# 3: simpson     74.20115  2.554056 0.01184907
# ANOVA results: refit each model to print the full likelihood-ratio tables
for (measure in measures) {
  f = paste(canker_design, "+", measure)
  m = glm.nb(f, data = all_alpha_ord)
  print(anova(base_model, m))
}
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                              Model    theta Resid. df    2 x log-lik.   Test
# 1           Site * Storage * Scion 2.882191        33       -554.6118       
# 2 Site * Storage * Scion + S.chao1 2.904996        32       -553.7848 1 vs 2
#      df  LR stat.   Pr(Chi)
# 1                          
# 2     1 0.8269849 0.3631458
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                              Model    theta Resid. df    2 x log-lik.   Test
# 1           Site * Storage * Scion 2.882191        33       -554.6118       
# 2 Site * Storage * Scion + shannon 3.049289        32       -550.1251 1 vs 2
#      df LR stat.    Pr(Chi)
# 1                          
# 2     1 4.486679 0.03415997
# Likelihood ratio tests of Negative Binomial Models
# 
# Response: Cankers
#                              Model    theta Resid. df    2 x log-lik.   Test
# 1           Site * Storage * Scion 2.882191        33       -554.6118       
# 2 Site * Storage * Scion + simpson 3.136392        32       -548.2785 1 vs 2
#      df LR stat.    Pr(Chi)
# 1                          
# 2     1 6.333314 0.01184907

Effect of β-diversity on canker count

# Number of leading principal components to test against canker counts
no_pcs <- 4

# Merge PC scores with canker data (merge by rownames, then restore them)
pc_scores <- merge(colData, data.frame(mypca$x[, 1:no_pcs]), by = "row.names") %>% 
  column_to_rownames("Row.names")

# PC column names — assumes the PC scores are the last `no_pcs` columns
pcs <- tail(colnames(pc_scores), no_pcs)

# Base model with β-diversity data
base_model <- glm.nb(canker_design, data = pc_scores)

# ANOVA of β-diversity with canker count
# Likelihood ratio test of the canker model with vs without a single PC score
# as an extra covariate. Reads `canker_design` and `base_model` from the
# enclosing environment. Returns a one-row data frame with the PC name, its
# coefficient, percent deviance explained, and the LR-test p-value.
beta_canker_anova <- function(pc, data) {
  full_formula <- paste0(canker_design, "+", pc)
  fit <- glm.nb(full_formula, data = data)
  lr_test <- data.frame(anova(base_model, fit))
  term_anova <- data.frame(anova(fit))
  # Total deviance = explained deviance of all terms + residual deviance
  deviance_total <- sum(term_anova$Deviance, na.rm = T) + tail(term_anova$Resid..Dev, 1)
  data.frame(
    PC = pc,
    coef = fit$coefficients[pc],
    var = term_anova[pc, 'Deviance'] / deviance_total * 100,
    p = lr_test[2, 'Pr.Chi.']
  )
}

# Effect of β-diversity on canker count for each PC
beta_canker_results <- data.table(t(sapply(pcs, function(x) beta_canker_anova(x, pc_scores))))

# NOTE: these p-values are unadjusted LR-test p-values, one per PC
beta_canker_results
#     PC         coef        var            p
# 1: PC1  -0.02475561  0.2245606 0.0001345953
# 2: PC2 -0.007178287 0.02763263    0.3213615
# 3: PC3 -0.003666729  0.3145477    0.4859172
# 4: PC4   0.01570856  0.2983212  0.009036866

Extra figures

Abundance

# Combine fungal and bacterial sample metadata; label kingdom from the
# sequencing target (ITS = fungi, otherwise bacteria) and fix factor orders
# for plotting.
abundance_combined <- rbind(
  as.data.frame(colData(ubiome_FUN$dds)), as.data.frame(colData(ubiome_BAC$dds))
) %>% mutate(kingdom = ifelse(Target == "ITS", "Fungi", "Bacteria")) %>%
  mutate(kingdom = factor(kingdom, levels = c("Fungi", "Bacteria"))) %>%
  mutate(Storage = factor(Storage, levels = c("no", "yes")))

# abundance_bar <- ggbarplot(
#   data = abundance_combined, x = "Storage", y = "log_copy_number", 
#   fill = "Site", add = "mean_se", facet.by = "kingdom",
#   palette = cbPalette, position = position_dodge(0.8),
#   ylab = "Mean copy number (log10)", xlab = F, legend = "right"
# ) + guides(fill = guide_legend(title = "Site"))

# ggsave(
#   filename = "abundance_bar.png", plot = abundance_bar, path = "figures/", 
#   height = 12, width = 24, units = "cm"
# )

# abundance_bar

# Boxplot of log copy number by site and storage, faceted by kingdom
abundance_box <- ggboxplot(
  data = abundance_combined, x = "Site", y = "log_copy_number", 
  color = "Storage", add = "jitter", facet.by = "kingdom",
  palette = cbPalette, legend = "bottom",
  ylab = "Mean copy number (log10)", xlab = "Site"
) 

ggsave(
  filename = "abundance_box.png", plot = abundance_box, path = "figures/", 
  height = 12, width = 24, units = "cm"
)

abundance_box

Alpha diversity

# Combine fungal and bacterial α-diversity tables and reshape to long format
# (one row per sample x diversity measure) for the plots below.
# Fix: namespace the rename call explicitly — `rename` was resolved to a
# masking package (giving "unused arguments: (Chao1 = S.chao1, ...)"), which
# aborted this pipeline and left `alpha_combined` undefined for every
# downstream α-diversity figure.
alpha_combined <- rbind(fun_alpha, bac_alpha) %>% 
  subset(select = c("Site", "Storage", "Scion", "Target", "S.chao1", "shannon", "simpson")) %>%
  mutate(kingdom = ifelse(Target == "ITS", "Fungi", "Bacteria")) %>%
  mutate(kingdom = factor(kingdom, levels = c("Fungi", "Bacteria"))) %>%
  mutate(Storage = factor(Storage, levels = c("no", "yes"))) %>%
  dplyr::rename(Chao1 = S.chao1, Shannon = shannon, Simpson = simpson) %>%
  pivot_longer(
    cols = c("Chao1", "Shannon", "Simpson"), names_to = "measure", values_to = "value"
  )
# alpha_bar <- ggbarplot(
#   data = alpha_combined, x = "Storage", y = "value", 
#   fill = "Site", add = "mean_se", facet.by = c("measure", "kingdom"), 
#   palette = cbPalette, position = position_dodge(0.8), scales = "free_y",
#   ylab = "Mean diversity index", xlab = F, legend = "right"
# ) + guides(fill = guide_legend(title = "Site"))

# ggsave(
#   filename = "alpha_bar.png", plot = alpha_bar, path = "figures/", 
#   height = 12, width = 24, units = "cm"
# )

# alpha_bar

# Boxplots of each diversity measure by site/storage, faceted by kingdom.
# NOTE: the "object 'alpha_combined' not found" errors below all cascade from
# the rename() failure in the chunk above that builds alpha_combined.
alpha_box <- ggboxplot(
  data = alpha_combined, x = "Site", y = "value", 
  color = "Storage", add = "jitter", facet.by = c("measure", "kingdom"),
  palette = cbPalette, scales = "free_y", legend = "bottom",
  ylab = "Mean diversity index", xlab = "Site"
) #+ guides(color = guide_legend(position = "right"))
# Error in ggboxplot(data = alpha_combined, x = "Site", y = "value", color = "Storage", : object 'alpha_combined' not found
ggsave(
  filename = "alpha_box.png", plot = alpha_box, path = "figures/", 
  height = 24, width = 24, units = "cm"
)
# Error in plot_theme(plot): object 'alpha_box' not found
alpha_box
# Error in eval(expr, envir, enclos): object 'alpha_box' not found
# Fungi-only version of the same boxplot
alpha_box_fungi <- ggboxplot(
  data = alpha_combined[alpha_combined$kingdom == "Fungi", ], x = "Site", y = "value", 
  color = "Storage", add = "jitter", facet.by = "measure",
  palette = cbPalette, scales = "free_y", legend = "bottom",
  ylab = "Mean diversity index", xlab = "Site"
)
# Error in ggboxplot(data = alpha_combined[alpha_combined$kingdom == "Fungi", : object 'alpha_combined' not found
ggsave(
  filename = "alpha_box_fungi.png", plot = alpha_box_fungi, path = "figures/", 
  height = 12, width = 24, units = "cm"
)
# Error in plot_theme(plot): object 'alpha_box_fungi' not found
# Bacteria-only version of the same boxplot
alpha_box_bacteria <- ggboxplot(
  data = alpha_combined[alpha_combined$kingdom == "Bacteria", ], x = "Site", y = "value", 
  color = "Storage", add = "jitter", facet.by = "measure",
  palette = cbPalette, scales = "free_y", legend = "bottom",
  ylab = "Mean diversity index", xlab = "Site"
)
# Error in ggboxplot(data = alpha_combined[alpha_combined$kingdom == "Bacteria", : object 'alpha_combined' not found
ggsave(
  filename = "alpha_box_bacteria.png", plot = alpha_box_bacteria, path = "figures/", 
  height = 12, width = 24, units = "cm"
)
# Error in plot_theme(plot): object 'alpha_box_bacteria' not found

PCA

# Side-by-side fungal/bacterial PCA panels with a shared legend
pca_combo_plot <- ggarrange(
  fun_pca_plot, bac_pca_plot,
  ncol = 2, nrow = 1, labels = c("A", "B"),
  common.legend = T, legend = "right"
)

ggsave(
  filename = "pca_combo_plot.png", plot = pca_combo_plot, path = "figures/", 
  height = 18, width = 25, units = "cm"
)

pca_combo_plot

# Side-by-side fungal/bacterial NMDS panels with a shared legend
nmds_combo_plot <- ggarrange(
  fun_nmds_plot, bac_nmds_plot,
  ncol = 2, nrow = 1, labels = c("A", "B"),
  common.legend = T, legend = "right"
)

ggsave(
  filename = "nmds_combo_plot.png", plot = nmds_combo_plot, path = "figures/", 
  height = 18, width = 25, units = "cm"
)

nmds_combo_plot

# 2x2 grid of all four ordination panels.
# NOTE(review): `+ labs(...)` on a ggarrange object may not relabel the
# sub-plot legends — verify the rendered figure.
mega_combo_plot <- ggarrange(
  fun_pca_plot, bac_pca_plot,
  fun_nmds_plot, bac_nmds_plot,
  ncol = 2, nrow = 2, labels = c("A", "B", "C", "D"),
  common.legend = T, legend = "bottom"
) + labs(shape = "Storage")

ggsave(
  filename = "mega_combo_plot.png", plot = mega_combo_plot, path = "figures/", 
  height = 25, width = 30, units = "cm"
)

mega_combo_plot

# Save environment so downstream documents can load all objects
save.image("BAC.RData")